/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *
 *  For licencing details see kernel-base/COPYING
 */

14
#include <linux/perf_event.h>
I
Ingo Molnar 已提交
15 16 17 18
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
19
#include <linux/module.h>
I
Ingo Molnar 已提交
20 21
#include <linux/kdebug.h>
#include <linux/sched.h>
22
#include <linux/uaccess.h>
23
#include <linux/highmem.h>
24
#include <linux/cpu.h>
I
Ingo Molnar 已提交
25 26

#include <asm/apic.h>
27
#include <asm/stacktrace.h>
P
Peter Zijlstra 已提交
28
#include <asm/nmi.h>
I
Ingo Molnar 已提交
29

30
/*
 * File-scope 64-bit mask. NOTE(review): its users are outside this
 * excerpt - presumably a bitmap of usable counter indices; confirm.
 */
static u64 perf_event_mask __read_mostly;
31

32 33
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS	4

/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE		24

/* The size of a per-cpu BTS buffer in bytes: */
#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)

/* The BTS overflow threshold in bytes from the end of the buffer: */
#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)


/*
 * Bits in the debugctlmsr controlling branch tracing.
 */
#define X86_DEBUGCTL_TR			(1 << 6)
#define X86_DEBUGCTL_BTS		(1 << 7)
#define X86_DEBUGCTL_BTINT		(1 << 8)
#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)

/*
 * A debug store configuration.
 *
 * We only support architectures that use 64bit fields.
 */
struct debug_store {
	u64	bts_buffer_base;
	u64	bts_index;
	u64	bts_absolute_maximum;
	u64	bts_interrupt_threshold;
	u64	pebs_buffer_base;
	u64	pebs_index;
	u64	pebs_absolute_maximum;
	u64	pebs_interrupt_threshold;
68
	u64	pebs_event_reset[MAX_PEBS_EVENTS];
69 70
};

71 72
/*
 * Per-CPU PMU bookkeeping.
 *
 * @events:	 one slot per counter index
 * @used_mask:	 bitmap over counter indices (NOTE(review): presumably
 *		 the allocated counters; its users are outside this excerpt)
 * @active_mask: bitmap over counter indices, tested by the *_disable_all()
 *		 helpers to find counters that need touching
 * @interrupts:	 NOTE(review): not used in this excerpt; confirm meaning
 * @enabled:	 non-zero while the PMU is globally enabled on this CPU;
 *		 flipped by the *_enable_all()/*_disable_all() helpers
 * @ds:		 debug store area allocated by reserve_bts_hardware(),
 *		 NULL when none is set up
 */
struct cpu_hw_events {
	struct perf_event	*events[X86_PMC_IDX_MAX];
	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;
	struct debug_store	*ds;
};

80 81 82 83 84 85 86 87 88 89 90 91
/*
 * Ties an event code to a bitmap of counter indices (idxmsk).
 * NOTE(review): the code consuming these tables is outside this excerpt;
 * presumably idxmsk lists the counters the event may run on - confirm.
 */
struct event_constraint {
	unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int		code;
};

/* Build one table entry: event code c, first idxmsk word m. */
#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
/* Table terminator: an entry whose first idxmsk word is zero. */
#define EVENT_CONSTRAINT_END  { .code = 0, .idxmsk[0] = 0 }

/*
 * Walk constraint table c with cursor e, stopping at the first entry
 * whose idxmsk[0] is zero (i.e. at EVENT_CONSTRAINT_END).
 */
#define for_each_event_constraint(e, c) \
	for ((e) = (c); (e)->idxmsk[0]; (e)++)


I
Ingo Molnar 已提交
92
/*
93
 * struct x86_pmu - generic x86 pmu
I
Ingo Molnar 已提交
94
 */
95
struct x86_pmu {
96 97
	const char	*name;
	int		version;
98
	int		(*handle_irq)(struct pt_regs *);
99 100
	void		(*disable_all)(void);
	void		(*enable_all)(void);
101 102
	void		(*enable)(struct hw_perf_event *, int);
	void		(*disable)(struct hw_perf_event *, int);
103 104
	unsigned	eventsel;
	unsigned	perfctr;
105 106
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
107
	int		max_events;
108 109 110 111
	int		num_events;
	int		num_events_fixed;
	int		event_bits;
	u64		event_mask;
112
	int		apic;
113
	u64		max_period;
114
	u64		intel_ctrl;
115 116
	void		(*enable_bts)(u64 config);
	void		(*disable_bts)(void);
117 118
	int		(*get_event_idx)(struct cpu_hw_events *cpuc,
					 struct hw_perf_event *hwc);
119 120
};

121
static struct x86_pmu x86_pmu __read_mostly;
122

123
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
124 125
	.enabled = 1,
};
I
Ingo Molnar 已提交
126

127
static const struct event_constraint *event_constraints;
128

V
Vince Weaver 已提交
129 130 131 132 133 134 135
/*
 * Not sure about some of these
 */
static const u64 p6_perfmon_event_map[] =
{
  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
136 137
  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
  [PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
V
Vince Weaver 已提交
138 139 140 141 142
  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
};

143
static u64 p6_pmu_event_map(int hw_event)
V
Vince Weaver 已提交
144
{
145
	return p6_perfmon_event_map[hw_event];
V
Vince Weaver 已提交
146 147
}

148
/*
 * Event setting that is specified not to count anything.
 * We use this to effectively disable a counter.
 *
 * L2_RQSTS with 0 MESI unit mask.
 */
#define P6_NOP_EVENT			0x0000002EULL
155

156
/*
 * Reduce a raw, user supplied event config to the event select bits
 * that may be set on P6: event code, unit mask, edge, invert and
 * bits 24-31. Everything else is cleared.
 */
static u64 p6_pmu_raw_event(u64 hw_event)
{
#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
#define P6_EVNTSEL_INV_MASK		0x00800000ULL
#define P6_EVNTSEL_REG_MASK		0xFF000000ULL

#define P6_EVNTSEL_MASK			\
	(P6_EVNTSEL_EVENT_MASK |	\
	 P6_EVNTSEL_UNIT_MASK  |	\
	 P6_EVNTSEL_EDGE_MASK  |	\
	 P6_EVNTSEL_INV_MASK   |	\
	 P6_EVNTSEL_REG_MASK)

	return hw_event & P6_EVNTSEL_MASK;
}

174 175 176 177 178 179 180 181 182 183
/*
 * P6 event constraints: { event code, idxmsk[0] } pairs, terminated by
 * EVENT_CONSTRAINT_END.
 * NOTE(review): the mask presumably selects the counters (bit 0 =
 * counter 0, bit 1 = counter 1) the event may use - confirm with the
 * code that consumes this table.
 */
static const struct event_constraint intel_p6_event_constraints[] =
{
	EVENT_CONSTRAINT(0xc1, 0x1),	/* FLOPS */
	EVENT_CONSTRAINT(0x10, 0x1),	/* FP_COMP_OPS_EXE */
	EVENT_CONSTRAINT(0x11, 0x1),	/* FP_ASSIST */
	EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
	EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
	EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
	EVENT_CONSTRAINT_END
};
V
Vince Weaver 已提交
184

185 186 187
/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
188
static const u64 intel_perfmon_event_map[] =
I
Ingo Molnar 已提交
189
{
190 191 192 193 194 195 196
  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
I
Ingo Molnar 已提交
197 198
};

199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
/*
 * Intel Core event constraints: { event code, idxmsk[0] } pairs,
 * terminated by EVENT_CONSTRAINT_END.
 * NOTE(review): the mask presumably selects the permitted counters -
 * confirm with the code that consumes this table.
 */
static const struct event_constraint intel_core_event_constraints[] =
{
	EVENT_CONSTRAINT(0x10, 0x1),	/* FP_COMP_OPS_EXE */
	EVENT_CONSTRAINT(0x11, 0x2),	/* FP_ASSIST */
	EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
	EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
	EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
	EVENT_CONSTRAINT(0x18, 0x1),	/* IDLE_DURING_DIV */
	EVENT_CONSTRAINT(0x19, 0x2),	/* DELAYED_BYPASS */
	EVENT_CONSTRAINT(0xa1, 0x1),	/* RS_UOPS_DISPATCH_CYCLES */
	EVENT_CONSTRAINT(0xcb, 0x1),	/* MEM_LOAD_RETIRED */
	EVENT_CONSTRAINT_END
};

/*
 * Intel Nehalem event constraints: { event code, idxmsk[0] } pairs,
 * terminated by EVENT_CONSTRAINT_END. Every entry here uses mask 0x3.
 * NOTE(review): the mask presumably selects the permitted counters -
 * confirm with the code that consumes this table.
 */
static const struct event_constraint intel_nehalem_event_constraints[] =
{
	EVENT_CONSTRAINT(0x40, 0x3),	/* L1D_CACHE_LD */
	EVENT_CONSTRAINT(0x41, 0x3),	/* L1D_CACHE_ST */
	EVENT_CONSTRAINT(0x42, 0x3),	/* L1D_CACHE_LOCK */
	EVENT_CONSTRAINT(0x43, 0x3),	/* L1D_ALL_REF */
	EVENT_CONSTRAINT(0x4e, 0x3),	/* L1D_PREFETCH */
	EVENT_CONSTRAINT(0x4c, 0x3),	/* LOAD_HIT_PRE */
	EVENT_CONSTRAINT(0x51, 0x3),	/* L1D */
	EVENT_CONSTRAINT(0x52, 0x3),	/* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
	EVENT_CONSTRAINT(0x53, 0x3),	/* L1D_CACHE_LOCK_FB_HIT */
	EVENT_CONSTRAINT(0xc5, 0x3),	/* CACHE_LOCK_CYCLES */
	EVENT_CONSTRAINT_END
};

228
static u64 intel_pmu_event_map(int hw_event)
229
{
230
	return intel_perfmon_event_map[hw_event];
231
}
I
Ingo Molnar 已提交
232

233
/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 *
 * Indexed as [cache type][operation][result]; set_ext_hw_attr()
 * turns 0 into -ENOENT and -1 into -EINVAL.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

248
/*
 * Nehalem cache event encodings, indexed [type][op][result].
 * 0 = not supported, -1 = combination makes no sense.
 */
static __initconst u64 nehalem_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(LL  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

339
/*
 * Core2 cache event encodings, indexed [type][op][result].
 * 0 = not supported, -1 = combination makes no sense.
 */
static __initconst u64 core2_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(LL  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
		[ C(RESULT_MISS)   ] = 0x00c5, /* BR_INST_RETIRED.MISPRED    */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

430
/*
 * Atom cache event encodings, indexed [type][op][result].
 * 0 = not supported, -1 = combination makes no sense.
 */
static __initconst u64 atom_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(LL  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
		[ C(RESULT_MISS)   ] = 0x00c5, /* BR_INST_RETIRED.MISPRED    */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

521
/*
 * Reduce a raw, user supplied event config to the event select bits
 * that may be set on Intel PerfMon: event code, unit mask, edge,
 * invert and bits 24-31. Everything else is cleared.
 */
static u64 intel_pmu_raw_event(u64 hw_event)
{
#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL

#define CORE_EVNTSEL_MASK		\
	(CORE_EVNTSEL_EVENT_MASK |	\
	 CORE_EVNTSEL_UNIT_MASK  |	\
	 CORE_EVNTSEL_EDGE_MASK  |	\
	 CORE_EVNTSEL_INV_MASK   |	\
	 CORE_EVNTSEL_REG_MASK)

	return hw_event & CORE_EVNTSEL_MASK;
}

539
/*
 * AMD cache event encodings, indexed [type][op][result].
 * 0 = not supported, -1 = combination makes no sense.
 */
static __initconst u64 amd_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(LL  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DTLB Miss   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

630 631 632
/*
 * AMD Performance Monitor K7 and later.
 */
633
static const u64 amd_perfmon_event_map[] =
634
{
635 636 637 638 639 640
  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
641 642
};

643
static u64 amd_pmu_event_map(int hw_event)
644
{
645
	return amd_perfmon_event_map[hw_event];
646 647
}

648
/*
 * Reduce a raw, user supplied event config to the event select bits
 * that may be set on AMD K7 and later. Unlike the Intel variants the
 * event code mask also covers bits 32-34 (0x7000000FF).
 */
static u64 amd_pmu_raw_event(u64 hw_event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_EDGE_MASK  |	\
	 K7_EVNTSEL_INV_MASK   |	\
	 K7_EVNTSEL_REG_MASK)

	return hw_event & K7_EVNTSEL_MASK;
}

666
/*
667 668
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
669 670
 * Returns the delta events processed.
 */
671
static u64
672 673
x86_perf_event_update(struct perf_event *event,
			struct hw_perf_event *hwc, int idx)
674
{
675
	int shift = 64 - x86_pmu.event_bits;
676 677
	u64 prev_raw_count, new_raw_count;
	s64 delta;
678

679 680 681
	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

682
	/*
683
	 * Careful: an NMI might modify the previous event value.
684 685 686
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
687
	 * count to the generic event atomically:
688 689 690
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
691
	rdmsrl(hwc->event_base + idx, new_raw_count);
692 693 694 695 696 697 698 699

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
700
	 * (event-)time and add that to the generic event.
701 702
	 *
	 * Careful, not all hw sign-extends above the physical width
703
	 * of the count.
704
	 */
705 706
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;
707

708
	atomic64_add(delta, &event->count);
709
	atomic64_sub(delta, &hwc->period_left);
710 711

	return new_raw_count;
712 713
}

714
/*
 * Number of events currently holding the PMC/BTS hardware. The 0 <-> 1
 * transitions reserve/release the hardware under pmc_reserve_mutex.
 */
static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
719
#ifdef CONFIG_X86_LOCAL_APIC
P
Peter Zijlstra 已提交
720 721 722 723 724
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

725
	for (i = 0; i < x86_pmu.num_events; i++) {
726
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
P
Peter Zijlstra 已提交
727 728 729
			goto perfctr_fail;
	}

730
	for (i = 0; i < x86_pmu.num_events; i++) {
731
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
P
Peter Zijlstra 已提交
732 733
			goto eventsel_fail;
	}
734
#endif
P
Peter Zijlstra 已提交
735 736 737

	return true;

738
#ifdef CONFIG_X86_LOCAL_APIC
P
Peter Zijlstra 已提交
739 740
eventsel_fail:
	for (i--; i >= 0; i--)
741
		release_evntsel_nmi(x86_pmu.eventsel + i);
P
Peter Zijlstra 已提交
742

743
	i = x86_pmu.num_events;
P
Peter Zijlstra 已提交
744 745 746

perfctr_fail:
	for (i--; i >= 0; i--)
747
		release_perfctr_nmi(x86_pmu.perfctr + i);
P
Peter Zijlstra 已提交
748 749 750 751 752

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
753
#endif
P
Peter Zijlstra 已提交
754 755 756 757
}

/*
 * Undo reserve_pmc_hardware(): give back every perfctr/evntsel MSR and
 * re-enable the lapic NMI watchdog if that is the configured watchdog.
 * A no-op without CONFIG_X86_LOCAL_APIC.
 */
static void release_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	for (i = 0; i < x86_pmu.num_events; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
#endif
}

771 772 773 774 775 776 777
static inline bool bts_available(void)
{
	return x86_pmu.enable_bts != NULL;
}

static inline void init_debug_store_on_cpu(int cpu)
{
778
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
779 780 781 782 783

	if (!ds)
		return;

	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
784 785
		     (u32)((u64)(unsigned long)ds),
		     (u32)((u64)(unsigned long)ds >> 32));
786 787 788 789
}

static inline void fini_debug_store_on_cpu(int cpu)
{
790
	if (!per_cpu(cpu_hw_events, cpu).ds)
791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808
		return;

	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}

/*
 * Tear down the per-CPU debug stores: clear the DS area MSR on every
 * online CPU, then free the BTS buffer and the debug_store of every
 * possible CPU that has one. Safe to call on a partially set up state,
 * since CPUs without a debug store are skipped.
 */
static void release_bts_hardware(void)
{
	int cpu;

	if (!bts_available())
		return;

	get_online_cpus();

	for_each_online_cpu(cpu)
		fini_debug_store_on_cpu(cpu);

	for_each_possible_cpu(cpu) {
		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;

		if (!ds)
			continue;

		/* unpublish the pointer before freeing what it refers to */
		per_cpu(cpu_hw_events, cpu).ds = NULL;

		kfree((void *)(unsigned long)ds->bts_buffer_base);
		kfree(ds);
	}

	put_online_cpus();
}

/*
 * Allocate a BTS buffer and a debug store for every possible CPU and
 * install the debug store on all online CPUs.
 *
 * Returns 0 on success (also when BTS is not available at all) and
 * -ENOMEM if an allocation failed; in that case everything allocated
 * so far is released again.
 */
static int reserve_bts_hardware(void)
{
	int cpu, err = 0;

	if (!bts_available())
		return 0;

	get_online_cpus();

	for_each_possible_cpu(cpu) {
		struct debug_store *ds;
		void *buffer;

		/* assume failure until this CPU is fully set up */
		err = -ENOMEM;
		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
		if (unlikely(!buffer))
			break;

		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
		if (unlikely(!ds)) {
			kfree(buffer);
			break;
		}

		ds->bts_buffer_base = (u64)(unsigned long)buffer;
		ds->bts_index = ds->bts_buffer_base;
		ds->bts_absolute_maximum =
			ds->bts_buffer_base + BTS_BUFFER_SIZE;
		ds->bts_interrupt_threshold =
			ds->bts_absolute_maximum - BTS_OVFL_TH;

		per_cpu(cpu_hw_events, cpu).ds = ds;
		err = 0;
	}

	if (err)
		release_bts_hardware();
	else {
		for_each_online_cpu(cpu)
			init_debug_store_on_cpu(cpu);
	}

	put_online_cpus();

	return err;
}

870
static void hw_perf_event_destroy(struct perf_event *event)
P
Peter Zijlstra 已提交
871
{
872
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
P
Peter Zijlstra 已提交
873
		release_pmc_hardware();
874
		release_bts_hardware();
P
Peter Zijlstra 已提交
875 876 877 878
		mutex_unlock(&pmc_reserve_mutex);
	}
}

879 880 881 882 883
/* Non-zero once a PMU description with an IRQ handler is installed. */
static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq ? 1 : 0;
}

884
static inline int
885
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937
/*
 * Switch branch tracing on in the debug control MSR. The OS/USR bits
 * of @config select which privilege levels are traced: a level whose
 * bit is clear gets its BTS_OFF_* bit set.
 */
static void intel_pmu_enable_bts(u64 config)
{
	unsigned long ctl = get_debugctlmsr();

	ctl |= X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT;

	if ((config & ARCH_PERFMON_EVENTSEL_OS) == 0)
		ctl |= X86_DEBUGCTL_BTS_OFF_OS;
	if ((config & ARCH_PERFMON_EVENTSEL_USR) == 0)
		ctl |= X86_DEBUGCTL_BTS_OFF_USR;

	update_debugctlmsr(ctl);
}

static void intel_pmu_disable_bts(void)
{
938
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 940 941 942 943 944 945 946 947 948 949 950 951 952
	unsigned long debugctlmsr;

	if (!cpuc->ds)
		return;

	debugctlmsr = get_debugctlmsr();

	debugctlmsr &=
		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);

	update_debugctlmsr(debugctlmsr);
}

I
Ingo Molnar 已提交
953
/*
954
 * Setup the hardware configuration for a given attr_type
I
Ingo Molnar 已提交
955
 */
956
static int __hw_perf_event_init(struct perf_event *event)
I
Ingo Molnar 已提交
957
{
958 959
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
960
	u64 config;
P
Peter Zijlstra 已提交
961
	int err;
I
Ingo Molnar 已提交
962

963 964
	if (!x86_pmu_initialized())
		return -ENODEV;
I
Ingo Molnar 已提交
965

P
Peter Zijlstra 已提交
966
	err = 0;
967
	if (!atomic_inc_not_zero(&active_events)) {
P
Peter Zijlstra 已提交
968
		mutex_lock(&pmc_reserve_mutex);
969
		if (atomic_read(&active_events) == 0) {
970 971 972
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
973
				err = reserve_bts_hardware();
974 975
		}
		if (!err)
976
			atomic_inc(&active_events);
P
Peter Zijlstra 已提交
977 978 979 980 981
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

982
	event->destroy = hw_perf_event_destroy;
983

I
Ingo Molnar 已提交
984
	/*
985
	 * Generate PMC IRQs:
I
Ingo Molnar 已提交
986 987
	 * (keep 'enabled' bit clear for now)
	 */
988
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
I
Ingo Molnar 已提交
989

990 991
	hwc->idx = -1;

I
Ingo Molnar 已提交
992
	/*
993
	 * Count user and OS events unless requested not to.
I
Ingo Molnar 已提交
994
	 */
995
	if (!attr->exclude_user)
996
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
997
	if (!attr->exclude_kernel)
I
Ingo Molnar 已提交
998
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
999

1000
	if (!hwc->sample_period) {
1001
		hwc->sample_period = x86_pmu.max_period;
1002
		hwc->last_period = hwc->sample_period;
1003
		atomic64_set(&hwc->period_left, hwc->sample_period);
1004 1005 1006 1007
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
1008 1009
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
1010 1011 1012
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
1013
	}
1014

I
Ingo Molnar 已提交
1015
	/*
1016
	 * Raw hw_event type provide the config in the hw_event structure
I
Ingo Molnar 已提交
1017
	 */
1018 1019
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
1020
		return 0;
I
Ingo Molnar 已提交
1021 1022
	}

1023 1024 1025 1026 1027
	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;
1028

1029 1030 1031
	/*
	 * The generic map:
	 */
1032 1033 1034 1035 1036 1037 1038 1039
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

1040 1041 1042 1043
	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
1044 1045 1046 1047 1048 1049 1050 1051 1052
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!bts_available())
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
			return -EOPNOTSUPP;
	}
1053

1054
	hwc->config |= config;
P
Peter Zijlstra 已提交
1055

I
Ingo Molnar 已提交
1056 1057 1058
	return 0;
}

V
Vince Weaver 已提交
1059 1060
static void p6_pmu_disable_all(void)
{
1061
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1062
	u64 val;
V
Vince Weaver 已提交
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	barrier();

	/* p6 only has one enable register */
	rdmsrl(MSR_P6_EVNTSEL0, val);
	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
	wrmsrl(MSR_P6_EVNTSEL0, val);
}

1076
static void intel_pmu_disable_all(void)
1077
{
1078
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1079 1080 1081 1082 1083 1084 1085

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	barrier();

1086
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
1087 1088 1089

	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		intel_pmu_disable_bts();
I
Ingo Molnar 已提交
1090
}
1091

1092
static void amd_pmu_disable_all(void)
1093
{
1094
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 1096 1097 1098
	int idx;

	if (!cpuc->enabled)
		return;
1099 1100

	cpuc->enabled = 0;
1101 1102
	/*
	 * ensure we write the disable before we start disabling the
1103
	 * events proper, so that amd_pmu_enable_event() does the
1104
	 * right thing.
1105
	 */
1106
	barrier();
1107

1108
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
1109 1110
		u64 val;

1111
		if (!test_bit(idx, cpuc->active_mask))
1112
			continue;
1113
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
1114 1115 1116 1117
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1118 1119 1120
	}
}

1121
void hw_perf_disable(void)
1122
{
1123
	if (!x86_pmu_initialized())
1124 1125
		return;
	return x86_pmu.disable_all();
1126
}
I
Ingo Molnar 已提交
1127

V
Vince Weaver 已提交
1128 1129
static void p6_pmu_enable_all(void)
{
1130
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
V
Vince Weaver 已提交
1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
	unsigned long val;

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

	/* p6 only has one enable register */
	rdmsrl(MSR_P6_EVNTSEL0, val);
	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
	wrmsrl(MSR_P6_EVNTSEL0, val);
}

1145
static void intel_pmu_enable_all(void)
1146
{
1147
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1148 1149 1150 1151 1152 1153 1154

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

1155
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1156 1157

	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1158 1159
		struct perf_event *event =
			cpuc->events[X86_PMC_IDX_FIXED_BTS];
1160

1161
		if (WARN_ON_ONCE(!event))
1162 1163
			return;

1164
		intel_pmu_enable_bts(event->hw.config);
1165
	}
1166 1167
}

1168
static void amd_pmu_enable_all(void)
1169
{
1170
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1171 1172
	int idx;

1173
	if (cpuc->enabled)
1174 1175
		return;

1176 1177 1178
	cpuc->enabled = 1;
	barrier();

1179 1180
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		struct perf_event *event = cpuc->events[idx];
1181
		u64 val;
1182

1183
		if (!test_bit(idx, cpuc->active_mask))
1184
			continue;
1185

1186
		val = event->hw.config;
1187 1188
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1189 1190 1191
	}
}

1192
void hw_perf_enable(void)
1193
{
1194
	if (!x86_pmu_initialized())
1195
		return;
1196
	x86_pmu.enable_all();
1197 1198
}

1199
static inline u64 intel_pmu_get_status(void)
1200 1201 1202
{
	u64 status;

1203
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1204

1205
	return status;
1206 1207
}

1208
/* Write the bits in @ack to MSR_CORE_PERF_GLOBAL_OVF_CTRL. */
static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

1213
static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1214
{
V
Vince Weaver 已提交
1215
	(void)checking_wrmsrl(hwc->config_base + idx,
1216
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217 1218
}

1219
static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1220
{
V
Vince Weaver 已提交
1221
	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
1222 1223
}

1224
static inline void
1225
intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
1226 1227 1228 1229 1230 1231 1232 1233
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
V
Vince Weaver 已提交
1234 1235 1236 1237
	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
1238
p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
V
Vince Weaver 已提交
1239
{
1240 1241
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	u64 val = P6_NOP_EVENT;
V
Vince Weaver 已提交
1242

1243 1244
	if (cpuc->enabled)
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
V
Vince Weaver 已提交
1245 1246

	(void)checking_wrmsrl(hwc->config_base + idx, val);
1247 1248
}

1249
static inline void
1250
intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1251
{
1252 1253 1254 1255 1256
	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
		intel_pmu_disable_bts();
		return;
	}

1257 1258 1259 1260 1261
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

1262
	x86_pmu_disable_event(hwc, idx);
1263 1264 1265
}

static inline void
1266
amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1267
{
1268
	x86_pmu_disable_event(hwc, idx);
1269 1270
}

1271
/*
 * Per-CPU record of the 'left' value most recently programmed into each
 * counter by x86_perf_event_set_period(); it is only read back by
 * perf_event_print_debug().
 */
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
I
Ingo Molnar 已提交
1272

1273 1274
/*
 * Set the next IRQ period, based on the hwc->period_left value.
1275
 * To be called with the event disabled in hw:
1276
 */
1277
static int
1278 1279
x86_perf_event_set_period(struct perf_event *event,
			     struct hw_perf_event *hwc, int idx)
I
Ingo Molnar 已提交
1280
{
1281
	s64 left = atomic64_read(&hwc->period_left);
1282 1283
	s64 period = hwc->sample_period;
	int err, ret = 0;
1284

1285 1286 1287
	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

1288 1289 1290 1291 1292 1293
	/*
	 * If we are way outside a reasoable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
1294
		hwc->last_period = period;
1295
		ret = 1;
1296 1297 1298 1299 1300
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
1301
		hwc->last_period = period;
1302
		ret = 1;
1303
	}
1304
	/*
1305
	 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
1306 1307 1308
	 */
	if (unlikely(left < 2))
		left = 2;
I
Ingo Molnar 已提交
1309

1310 1311 1312
	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

1313
	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1314 1315

	/*
1316
	 * The hw event starts counting from this event offset,
1317 1318
	 * mark it to be able to extra future deltas:
	 */
1319
	atomic64_set(&hwc->prev_count, (u64)-left);
1320

1321 1322
	err = checking_wrmsrl(hwc->event_base + idx,
			     (u64)(-left) & x86_pmu.event_mask);
1323

1324
	perf_event_update_userpage(event);
1325

1326
	return ret;
1327 1328 1329
}

static inline void
1330
intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331 1332 1333 1334 1335 1336
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
1337 1338 1339
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
1340
	 */
1341 1342 1343
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
1344 1345 1346 1347 1348 1349 1350 1351 1352
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
1353 1354
}

1355
static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
V
Vince Weaver 已提交
1356
{
1357
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1358
	u64 val;
V
Vince Weaver 已提交
1359

1360
	val = hwc->config;
V
Vince Weaver 已提交
1361
	if (cpuc->enabled)
1362 1363 1364
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;

	(void)checking_wrmsrl(hwc->config_base + idx, val);
V
Vince Weaver 已提交
1365 1366 1367
}


1368
static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1369
{
1370
	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1371
		if (!__get_cpu_var(cpu_hw_events).enabled)
1372 1373 1374 1375 1376 1377
			return;

		intel_pmu_enable_bts(hwc->config);
		return;
	}

1378 1379 1380 1381 1382
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

1383
	x86_pmu_enable_event(hwc, idx);
1384 1385
}

1386
static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1387
{
1388
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1389 1390

	if (cpuc->enabled)
1391
		x86_pmu_enable_event(hwc, idx);
I
Ingo Molnar 已提交
1392 1393
}

1394
static int fixed_mode_idx(struct hw_perf_event *hwc)
1395
{
1396
	unsigned int hw_event;
1397

1398
	hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1399

1400
	if (unlikely((hw_event ==
1401 1402 1403 1404
		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
		     (hwc->sample_period == 1)))
		return X86_PMC_IDX_FIXED_BTS;

1405
	if (!x86_pmu.num_events_fixed)
1406 1407
		return -1;

1408 1409 1410 1411 1412 1413
	/*
	 * fixed counters do not take all possible filters
	 */
	if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
		return -1;

1414
	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1415
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1416
	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1417
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
1418
	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1419 1420
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

1421 1422 1423
	return -1;
}

1424 1425 1426
/*
 * generic counter allocator: get next free counter
 */
1427 1428
static int
gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429 1430 1431 1432 1433 1434 1435 1436 1437 1438
{
	int idx;

	idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
	return idx == x86_pmu.num_events ? -1 : idx;
}

/*
 * intel-specific counter allocator: check event constraints
 */
1439 1440
static int
intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441 1442 1443 1444
{
	const struct event_constraint *event_constraint;
	int i, code;

1445
	if (!event_constraints)
1446 1447
		goto skip;

1448
	code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449

1450
	for_each_event_constraint(event_constraint, event_constraints) {
1451 1452 1453 1454 1455 1456 1457 1458 1459
		if (code == event_constraint->code) {
			for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_and_set_bit(i, cpuc->used_mask))
					return i;
			}
			return -1;
		}
	}
skip:
1460
	return gen_get_event_idx(cpuc, hwc);
1461 1462
}

1463 1464
static int
x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
I
Ingo Molnar 已提交
1465
{
1466
	int idx;
I
Ingo Molnar 已提交
1467

1468
	idx = fixed_mode_idx(hwc);
1469
	if (idx == X86_PMC_IDX_FIXED_BTS) {
1470
		/* BTS is already occupied. */
1471
		if (test_and_set_bit(idx, cpuc->used_mask))
1472
			return -EAGAIN;
1473 1474

		hwc->config_base	= 0;
1475
		hwc->event_base		= 0;
1476 1477
		hwc->idx		= idx;
	} else if (idx >= 0) {
1478
		/*
1479 1480
		 * Try to get the fixed event, if that is already taken
		 * then try to get a generic event:
1481
		 */
1482
		if (test_and_set_bit(idx, cpuc->used_mask))
1483
			goto try_generic;
1484

1485 1486
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
1487
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1488 1489
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
1490
		hwc->event_base =
1491
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
I
Ingo Molnar 已提交
1492
		hwc->idx = idx;
1493 1494
	} else {
		idx = hwc->idx;
1495
		/* Try to get the previous generic event again */
1496
		if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1497
try_generic:
1498
			idx = x86_pmu.get_event_idx(cpuc, hwc);
1499
			if (idx == -1)
1500 1501
				return -EAGAIN;

1502
			set_bit(idx, cpuc->used_mask);
1503 1504
			hwc->idx = idx;
		}
1505 1506
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
I
Ingo Molnar 已提交
1507 1508
	}

1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524
	return idx;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in event:
 */
static int x86_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	idx = x86_schedule_event(cpuc, hwc);
	if (idx < 0)
		return idx;

1525
	perf_events_lapic_init();
1526

1527
	x86_pmu.disable(hwc, idx);
I
Ingo Molnar 已提交
1528

1529
	cpuc->events[idx] = event;
1530
	set_bit(idx, cpuc->active_mask);
1531

1532
	x86_perf_event_set_period(event, hwc, idx);
1533
	x86_pmu.enable(hwc, idx);
1534

1535
	perf_event_update_userpage(event);
1536

1537
	return 0;
I
Ingo Molnar 已提交
1538 1539
}

1540
static void x86_pmu_unthrottle(struct perf_event *event)
1541
{
1542 1543
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
1544 1545

	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1546
				cpuc->events[hwc->idx] != event))
1547 1548 1549 1550 1551
		return;

	x86_pmu.enable(hwc, hwc->idx);
}

1552
void perf_event_print_debug(void)
I
Ingo Molnar 已提交
1553
{
1554
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1555
	struct cpu_hw_events *cpuc;
1556
	unsigned long flags;
1557 1558
	int cpu, idx;

1559
	if (!x86_pmu.num_events)
1560
		return;
I
Ingo Molnar 已提交
1561

1562
	local_irq_save(flags);
I
Ingo Molnar 已提交
1563 1564

	cpu = smp_processor_id();
1565
	cpuc = &per_cpu(cpu_hw_events, cpu);
I
Ingo Molnar 已提交
1566

1567
	if (x86_pmu.version >= 2) {
1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1578
	}
1579
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
I
Ingo Molnar 已提交
1580

1581
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
1582 1583
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
I
Ingo Molnar 已提交
1584

1585
		prev_left = per_cpu(pmc_prev_left[idx], cpu);
I
Ingo Molnar 已提交
1586

1587
		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
I
Ingo Molnar 已提交
1588
			cpu, idx, pmc_ctrl);
1589
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
I
Ingo Molnar 已提交
1590
			cpu, idx, pmc_count);
1591
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1592
			cpu, idx, prev_left);
I
Ingo Molnar 已提交
1593
	}
1594
	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1595 1596
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

1597
		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1598 1599
			cpu, idx, pmc_count);
	}
1600
	local_irq_restore(flags);
I
Ingo Molnar 已提交
1601 1602
}

1603
static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1604 1605 1606 1607 1608 1609 1610
{
	struct debug_store *ds = cpuc->ds;
	struct bts_record {
		u64	from;
		u64	to;
		u64	flags;
	};
1611
	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1612
	struct bts_record *at, *top;
1613 1614 1615 1616
	struct perf_output_handle handle;
	struct perf_event_header header;
	struct perf_sample_data data;
	struct pt_regs regs;
1617

1618
	if (!event)
1619 1620 1621 1622 1623
		return;

	if (!ds)
		return;

1624 1625
	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
	top = (struct bts_record *)(unsigned long)ds->bts_index;
1626

1627 1628 1629
	if (top <= at)
		return;

1630 1631
	ds->bts_index = ds->bts_buffer_base;

1632

1633
	data.period	= event->hw.last_period;
1634 1635 1636 1637 1638 1639 1640 1641
	data.addr	= 0;
	regs.ip		= 0;

	/*
	 * Prepare a generic sample, i.e. fill in the invariant fields.
	 * We will overwrite the from and to address before we output
	 * the sample.
	 */
1642
	perf_prepare_sample(&header, &data, event, &regs);
1643

1644
	if (perf_output_begin(&handle, event,
1645 1646 1647
			      header.size * (top - at), 1, 1))
		return;

1648
	for (; at < top; at++) {
1649 1650
		data.ip		= at->from;
		data.addr	= at->to;
1651

1652
		perf_output_sample(&handle, &header, &data, event);
1653 1654
	}

1655
	perf_output_end(&handle);
1656 1657

	/* There's new data available. */
1658 1659
	event->hw.interrupts++;
	event->pending_kill = POLL_IN;
1660 1661
}

1662
static void x86_pmu_disable(struct perf_event *event)
I
Ingo Molnar 已提交
1663
{
1664 1665
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
1666
	int idx = hwc->idx;
I
Ingo Molnar 已提交
1667

1668 1669 1670 1671
	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
1672
	clear_bit(idx, cpuc->active_mask);
1673
	x86_pmu.disable(hwc, idx);
I
Ingo Molnar 已提交
1674

1675 1676
	/*
	 * Make sure the cleared pointer becomes visible before we
1677
	 * (potentially) free the event:
1678
	 */
1679
	barrier();
I
Ingo Molnar 已提交
1680

1681
	/*
1682
	 * Drain the remaining delta count out of a event
1683 1684
	 * that we are disabling:
	 */
1685
	x86_perf_event_update(event, hwc, idx);
1686 1687

	/* Drain the remaining BTS records. */
1688 1689
	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
		intel_pmu_drain_bts_buffer(cpuc);
1690

1691
	cpuc->events[idx] = NULL;
1692
	clear_bit(idx, cpuc->used_mask);
1693

1694
	perf_event_update_userpage(event);
I
Ingo Molnar 已提交
1695 1696
}

1697
/*
1698 1699
 * Save and restart an expired event. Called by NMI contexts,
 * so it has to be careful about preempting normal event ops:
1700
 */
1701
static int intel_pmu_save_and_restart(struct perf_event *event)
I
Ingo Molnar 已提交
1702
{
1703
	struct hw_perf_event *hwc = &event->hw;
I
Ingo Molnar 已提交
1704
	int idx = hwc->idx;
1705
	int ret;
I
Ingo Molnar 已提交
1706

1707 1708
	x86_perf_event_update(event, hwc, idx);
	ret = x86_perf_event_set_period(event, hwc, idx);
1709

1710 1711
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		intel_pmu_enable_event(hwc, idx);
1712 1713

	return ret;
I
Ingo Molnar 已提交
1714 1715
}

1716 1717
static void intel_pmu_reset(void)
{
1718
	struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1719 1720 1721
	unsigned long flags;
	int idx;

1722
	if (!x86_pmu.num_events)
1723 1724 1725 1726 1727 1728
		return;

	local_irq_save(flags);

	printk("clearing PMU state on CPU#%d\n", smp_processor_id());

1729
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
1730 1731 1732
		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
	}
1733
	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1734 1735
		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
	}
1736 1737
	if (ds)
		ds->bts_index = ds->bts_buffer_base;
1738 1739 1740 1741

	local_irq_restore(flags);
}

V
Vince Weaver 已提交
1742 1743 1744
static int p6_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
1745 1746 1747
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
V
Vince Weaver 已提交
1748 1749 1750 1751 1752
	int idx, handled = 0;
	u64 val;

	data.addr = 0;

1753
	cpuc = &__get_cpu_var(cpu_hw_events);
V
Vince Weaver 已提交
1754

1755
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
V
Vince Weaver 已提交
1756 1757 1758
		if (!test_bit(idx, cpuc->active_mask))
			continue;

1759 1760
		event = cpuc->events[idx];
		hwc = &event->hw;
V
Vince Weaver 已提交
1761

1762 1763
		val = x86_perf_event_update(event, hwc, idx);
		if (val & (1ULL << (x86_pmu.event_bits - 1)))
V
Vince Weaver 已提交
1764 1765 1766
			continue;

		/*
1767
		 * event overflow
V
Vince Weaver 已提交
1768 1769
		 */
		handled		= 1;
1770
		data.period	= event->hw.last_period;
V
Vince Weaver 已提交
1771

1772
		if (!x86_perf_event_set_period(event, hwc, idx))
V
Vince Weaver 已提交
1773 1774
			continue;

1775 1776
		if (perf_event_overflow(event, 1, &data, regs))
			p6_pmu_disable_event(hwc, idx);
V
Vince Weaver 已提交
1777 1778 1779 1780 1781 1782 1783
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}
1784

I
Ingo Molnar 已提交
1785 1786 1787 1788
/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
1789
static int intel_pmu_handle_irq(struct pt_regs *regs)
I
Ingo Molnar 已提交
1790
{
1791
	struct perf_sample_data data;
1792
	struct cpu_hw_events *cpuc;
V
Vince Weaver 已提交
1793
	int bit, loops;
1794
	u64 ack, status;
1795

1796 1797
	data.addr = 0;

1798
	cpuc = &__get_cpu_var(cpu_hw_events);
I
Ingo Molnar 已提交
1799

1800
	perf_disable();
1801
	intel_pmu_drain_bts_buffer(cpuc);
1802
	status = intel_pmu_get_status();
1803 1804 1805 1806
	if (!status) {
		perf_enable();
		return 0;
	}
1807

1808
	loops = 0;
I
Ingo Molnar 已提交
1809
again:
1810
	if (++loops > 100) {
1811 1812
		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
		perf_event_print_debug();
1813 1814
		intel_pmu_reset();
		perf_enable();
1815 1816 1817
		return 1;
	}

1818
	inc_irq_stat(apic_perf_irqs);
I
Ingo Molnar 已提交
1819
	ack = status;
1820
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1821
		struct perf_event *event = cpuc->events[bit];
I
Ingo Molnar 已提交
1822 1823

		clear_bit(bit, (unsigned long *) &status);
1824
		if (!test_bit(bit, cpuc->active_mask))
I
Ingo Molnar 已提交
1825 1826
			continue;

1827
		if (!intel_pmu_save_and_restart(event))
1828 1829
			continue;

1830
		data.period = event->hw.last_period;
1831

1832 1833
		if (perf_event_overflow(event, 1, &data, regs))
			intel_pmu_disable_event(&event->hw, bit);
I
Ingo Molnar 已提交
1834 1835
	}

1836
	intel_pmu_ack_status(ack);
I
Ingo Molnar 已提交
1837 1838 1839 1840

	/*
	 * Repeat if there is more work to be done:
	 */
1841
	status = intel_pmu_get_status();
I
Ingo Molnar 已提交
1842 1843
	if (status)
		goto again;
1844

1845
	perf_enable();
1846 1847

	return 1;
1848 1849
}

1850
static int amd_pmu_handle_irq(struct pt_regs *regs)
1851
{
1852
	struct perf_sample_data data;
1853 1854 1855
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
V
Vince Weaver 已提交
1856
	int idx, handled = 0;
1857 1858
	u64 val;

1859 1860
	data.addr = 0;

1861
	cpuc = &__get_cpu_var(cpu_hw_events);
1862

1863
	for (idx = 0; idx < x86_pmu.num_events; idx++) {
1864
		if (!test_bit(idx, cpuc->active_mask))
1865
			continue;
1866

1867 1868
		event = cpuc->events[idx];
		hwc = &event->hw;
1869

1870 1871
		val = x86_perf_event_update(event, hwc, idx);
		if (val & (1ULL << (x86_pmu.event_bits - 1)))
1872
			continue;
1873

1874
		/*
1875
		 * event overflow
1876 1877
		 */
		handled		= 1;
1878
		data.period	= event->hw.last_period;
1879

1880
		if (!x86_perf_event_set_period(event, hwc, idx))
1881 1882
			continue;

1883 1884
		if (perf_event_overflow(event, 1, &data, regs))
			amd_pmu_disable_event(hwc, idx);
1885
	}
1886

1887 1888 1889
	if (handled)
		inc_irq_stat(apic_perf_irqs);

1890 1891
	return handled;
}
1892

1893 1894 1895 1896 1897
/*
 * Interrupt entry for pending perf work: acknowledges the APIC
 * interrupt, bumps the apic_pending_irqs statistic and runs
 * perf_event_do_pending(). @regs is not used.
 */
void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();

	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_event_do_pending();

	irq_exit();
}

1902
/*
 * Send LOCAL_PENDING_VECTOR to the current CPU as a self-IPI. Does
 * nothing without local APIC support, without a usable APIC, or before
 * the PMU is initialized.
 */
void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (x86_pmu.apic && x86_pmu_initialized())
		apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}

1912
/*
 * Program the local APIC performance-counter LVT entry for NMI
 * delivery. Does nothing without local APIC support, without a usable
 * APIC, or before the PMU is initialized.
 */
void perf_events_lapic_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Always use NMI for PMU
	 */
	if (x86_pmu.apic && x86_pmu_initialized())
		apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
}

static int __kprobes
1926
perf_event_nmi_handler(struct notifier_block *self,
I
Ingo Molnar 已提交
1927 1928 1929 1930
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;
1931

1932
	if (!atomic_read(&active_events))
1933 1934
		return NOTIFY_DONE;

1935 1936 1937 1938
	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;
I
Ingo Molnar 已提交
1939

1940
	default:
I
Ingo Molnar 已提交
1941
		return NOTIFY_DONE;
1942
	}
I
Ingo Molnar 已提交
1943 1944 1945

	regs = args->regs;

1946
#ifdef CONFIG_X86_LOCAL_APIC
I
Ingo Molnar 已提交
1947
	apic_write(APIC_LVTPC, APIC_DM_NMI);
1948
#endif
1949 1950
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
1951
	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
1952 1953 1954 1955
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
1956
	x86_pmu.handle_irq(regs);
I
Ingo Molnar 已提交
1957

1958
	return NOTIFY_STOP;
I
Ingo Molnar 已提交
1959 1960
}

1961 1962
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
1963 1964
	.next			= NULL,
	.priority		= 1
I
Ingo Molnar 已提交
1965 1966
};

1967
static __initconst struct x86_pmu p6_pmu = {
V
Vince Weaver 已提交
1968 1969 1970 1971
	.name			= "p6",
	.handle_irq		= p6_pmu_handle_irq,
	.disable_all		= p6_pmu_disable_all,
	.enable_all		= p6_pmu_enable_all,
1972 1973
	.enable			= p6_pmu_enable_event,
	.disable		= p6_pmu_disable_event,
V
Vince Weaver 已提交
1974 1975 1976 1977 1978
	.eventsel		= MSR_P6_EVNTSEL0,
	.perfctr		= MSR_P6_PERFCTR0,
	.event_map		= p6_pmu_event_map,
	.raw_event		= p6_pmu_raw_event,
	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
1979
	.apic			= 1,
V
Vince Weaver 已提交
1980 1981
	.max_period		= (1ULL << 31) - 1,
	.version		= 0,
1982
	.num_events		= 2,
V
Vince Weaver 已提交
1983
	/*
1984
	 * Events have 40 bits implemented. However they are designed such
V
Vince Weaver 已提交
1985
	 * that bits [32-39] are sign extensions of bit 31. As such the
1986
	 * effective width of a event for P6-like PMU is 32 bits only.
V
Vince Weaver 已提交
1987 1988 1989
	 *
	 * See IA-32 Intel Architecture Software developer manual Vol 3B
	 */
1990 1991
	.event_bits		= 32,
	.event_mask		= (1ULL << 32) - 1,
1992
	.get_event_idx		= intel_get_event_idx,
V
Vince Weaver 已提交
1993 1994
};

1995
static __initconst struct x86_pmu intel_pmu = {
1996
	.name			= "Intel",
1997
	.handle_irq		= intel_pmu_handle_irq,
1998 1999
	.disable_all		= intel_pmu_disable_all,
	.enable_all		= intel_pmu_enable_all,
2000 2001
	.enable			= intel_pmu_enable_event,
	.disable		= intel_pmu_disable_event,
2002 2003
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
2004 2005
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
2006
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
2007
	.apic			= 1,
2008 2009 2010
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
2011
	 * the generic event period:
2012 2013
	 */
	.max_period		= (1ULL << 31) - 1,
2014 2015
	.enable_bts		= intel_pmu_enable_bts,
	.disable_bts		= intel_pmu_disable_bts,
2016
	.get_event_idx		= intel_get_event_idx,
2017 2018
};

2019
static __initconst struct x86_pmu amd_pmu = {
2020
	.name			= "AMD",
2021
	.handle_irq		= amd_pmu_handle_irq,
2022 2023
	.disable_all		= amd_pmu_disable_all,
	.enable_all		= amd_pmu_enable_all,
2024 2025
	.enable			= amd_pmu_enable_event,
	.disable		= amd_pmu_disable_event,
2026 2027
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
2028 2029
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
2030
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
2031 2032 2033
	.num_events		= 4,
	.event_bits		= 48,
	.event_mask		= (1ULL << 48) - 1,
2034
	.apic			= 1,
2035 2036
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
2037
	.get_event_idx		= gen_get_event_idx,
2038 2039
};

2040
static __init int p6_pmu_init(void)
V
Vince Weaver 已提交
2041 2042 2043 2044 2045 2046 2047 2048 2049
{
	switch (boot_cpu_data.x86_model) {
	case 1:
	case 3:  /* Pentium Pro */
	case 5:
	case 6:  /* Pentium II */
	case 7:
	case 8:
	case 11: /* Pentium III */
2050
		event_constraints = intel_p6_event_constraints;
V
Vince Weaver 已提交
2051 2052 2053
		break;
	case 9:
	case 13:
2054
		/* Pentium M */
2055
		event_constraints = intel_p6_event_constraints;
2056
		break;
V
Vince Weaver 已提交
2057 2058 2059 2060 2061 2062
	default:
		pr_cont("unsupported p6 CPU model %d ",
			boot_cpu_data.x86_model);
		return -ENODEV;
	}

2063 2064
	x86_pmu = p6_pmu;

V
Vince Weaver 已提交
2065
	if (!cpu_has_apic) {
2066
		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2067 2068
		pr_info("no hardware sampling interrupt available.\n");
		x86_pmu.apic = 0;
V
Vince Weaver 已提交
2069 2070 2071 2072 2073
	}

	return 0;
}

2074
static __init int intel_pmu_init(void)
I
Ingo Molnar 已提交
2075
{
2076
	union cpuid10_edx edx;
I
Ingo Molnar 已提交
2077
	union cpuid10_eax eax;
2078
	unsigned int unused;
2079
	unsigned int ebx;
2080
	int version;
I
Ingo Molnar 已提交
2081

V
Vince Weaver 已提交
2082 2083 2084 2085 2086
	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
		/* check for P6 processor family */
	   if (boot_cpu_data.x86 == 6) {
		return p6_pmu_init();
	   } else {
2087
		return -ENODEV;
V
Vince Weaver 已提交
2088 2089
	   }
	}
2090

I
Ingo Molnar 已提交
2091 2092
	/*
	 * Check whether the Architectural PerfMon supports
2093
	 * Branch Misses Retired hw_event or not.
I
Ingo Molnar 已提交
2094
	 */
2095
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
I
Ingo Molnar 已提交
2096
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2097
		return -ENODEV;
I
Ingo Molnar 已提交
2098

2099 2100
	version = eax.split.version_id;
	if (version < 2)
2101
		return -ENODEV;
2102

2103 2104
	x86_pmu				= intel_pmu;
	x86_pmu.version			= version;
2105 2106 2107
	x86_pmu.num_events		= eax.split.num_events;
	x86_pmu.event_bits		= eax.split.bit_width;
	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;
2108 2109

	/*
2110 2111
	 * Quirk: v2 perfmon does not report fixed-purpose events, so
	 * assume at least 3 events:
2112
	 */
2113
	x86_pmu.num_events_fixed	= max((int)edx.split.num_events_fixed, 3);
2114

2115
	/*
2116
	 * Install the hw-cache-events table:
2117 2118
	 */
	switch (boot_cpu_data.x86_model) {
2119 2120 2121 2122
	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
	case 29: /* six-core 45 nm xeon "Dunnington" */
2123
		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
2124
		       sizeof(hw_cache_event_ids));
2125

2126
		pr_cont("Core2 events, ");
2127
		event_constraints = intel_core_event_constraints;
2128 2129 2130 2131
		break;
	default:
	case 26:
		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2132
		       sizeof(hw_cache_event_ids));
2133

2134
		event_constraints = intel_nehalem_event_constraints;
2135
		pr_cont("Nehalem/Corei7 events, ");
2136 2137 2138
		break;
	case 28:
		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2139
		       sizeof(hw_cache_event_ids));
2140

2141
		pr_cont("Atom events, ");
2142 2143
		break;
	}
2144
	return 0;
2145 2146
}

2147
static __init int amd_pmu_init(void)
2148
{
2149 2150 2151 2152
	/* Performance-monitoring supported from K7 and later: */
	if (boot_cpu_data.x86 < 6)
		return -ENODEV;

2153
	x86_pmu = amd_pmu;
2154

2155 2156 2157
	/* Events are common for all AMDs */
	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
	       sizeof(hw_cache_event_ids));
2158

2159
	return 0;
2160 2161
}

2162
/*
 * Boot-time PMU setup.
 *
 * Runs the vendor-specific init routine, clips the generic and fixed
 * counter counts to X86_PMC_MAX_GENERIC / X86_PMC_MAX_FIXED, builds
 * perf_event_mask from those counts, then calls perf_events_lapic_init()
 * and registers perf_event_nmi_notifier as a die notifier.
 *
 * Returns early (leaving everything untouched) for vendors other than
 * Intel/AMD, or when the vendor init routine reports an error.
 */
void __init init_hw_perf_events(void)
{
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
	}
	/*
	 * perf_event_mask is a u64: do the shift in 64-bit unsigned
	 * arithmetic rather than in plain int.
	 */
	perf_event_mask = (1ULL << x86_pmu.num_events) - 1;
	perf_max_events = x86_pmu.num_events;

	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
	}

	/* Fixed-purpose counters occupy the bits from X86_PMC_IDX_FIXED up. */
	perf_event_mask |=
		((1ULL << x86_pmu.num_events_fixed) - 1) << X86_PMC_IDX_FIXED;
	x86_pmu.intel_ctrl = perf_event_mask;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
	pr_info("... event mask:             %016Lx\n", perf_event_mask);
}
I
Ingo Molnar 已提交
2214

2215
static inline void x86_pmu_read(struct perf_event *event)
2216
{
2217
	x86_perf_event_update(event, &event->hw, event->hw.idx);
2218 2219
}

2220 2221 2222 2223
static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
2224
	.unthrottle	= x86_pmu_unthrottle,
I
Ingo Molnar 已提交
2225 2226
};

2227 2228 2229 2230 2231
static int
validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct hw_perf_event fake_event = event->hw;

2232
	if (event->pmu && event->pmu != &pmu)
2233 2234
		return 0;

2235
	return x86_schedule_event(cpuc, &fake_event) >= 0;
2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258
}

static int validate_group(struct perf_event *event)
{
	struct perf_event *sibling, *leader = event->group_leader;
	struct cpu_hw_events fake_pmu;

	memset(&fake_pmu, 0, sizeof(fake_pmu));

	if (!validate_event(&fake_pmu, leader))
		return -ENOSPC;

	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
		if (!validate_event(&fake_pmu, sibling))
			return -ENOSPC;
	}

	if (!validate_event(&fake_pmu, event))
		return -ENOSPC;

	return 0;
}

2259
const struct pmu *hw_perf_event_init(struct perf_event *event)
I
Ingo Molnar 已提交
2260 2261 2262
{
	int err;

2263
	err = __hw_perf_event_init(event);
2264 2265 2266 2267
	if (!err) {
		if (event->group_leader != event)
			err = validate_group(event);
	}
2268
	if (err) {
2269 2270
		if (event->destroy)
			event->destroy(event);
2271
		return ERR_PTR(err);
2272
	}
I
Ingo Molnar 已提交
2273

2274
	return &pmu;
I
Ingo Molnar 已提交
2275
}
2276 2277 2278 2279 2280 2281

/*
 * callchain support
 */

static inline
2282
void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2283
{
2284
	if (entry->nr < PERF_MAX_STACK_DEPTH)
2285 2286 2287
		entry->ip[entry->nr++] = ip;
}

2288 2289
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2290
static DEFINE_PER_CPU(int, in_nmi_frame);
2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305


/* stacktrace_ops ->warning_symbol hook: intentionally a no-op. */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

/* stacktrace_ops ->warning hook: intentionally a no-op. */
static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
2306 2307 2308
	per_cpu(in_nmi_frame, smp_processor_id()) =
			x86_is_stack_id(NMI_STACK, name);

2309
	return 0;
2310 2311 2312 2313 2314 2315
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

2316 2317 2318
	if (per_cpu(in_nmi_frame, smp_processor_id()))
		return;

2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329
	if (reliable)
		callchain_store(entry, addr);
}

/*
 * Callbacks passed to dump_trace() by perf_callchain_kernel(): warnings are
 * ignored, ->stack tracks whether the NMI stack is being walked, and
 * ->address stores the addresses into the callchain entry.
 */
static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};

2330 2331
#include "../dumpstack.h"

2332 2333 2334
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
2335
	callchain_store(entry, PERF_CONTEXT_KERNEL);
2336
	callchain_store(entry, regs->ip);
2337

2338
	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
2339 2340
}

2341 2342 2343 2344 2345
/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 *
 * Copies up to @n bytes from user address @from into @to, one page at a
 * time, and stops at the first page __get_user_pages_fast() cannot
 * provide. Returns the number of bytes actually copied.
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long uaddr = (unsigned long)from;
	const int km_slot = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long copied = 0;

	do {
		unsigned long pg_off, chunk;
		struct page *page;
		void *kaddr;

		if (!__get_user_pages_fast(uaddr, 1, 0, &page))
			break;

		/* Never copy past the end of the page that was just pinned. */
		pg_off = uaddr & (PAGE_SIZE - 1);
		chunk = min(PAGE_SIZE - pg_off, n - copied);

		kaddr = kmap_atomic(page, km_slot);
		memcpy(to, kaddr + pg_off, chunk);
		kunmap_atomic(kaddr, km_slot);
		put_page(page);

		copied += chunk;
		to     += chunk;
		uaddr  += chunk;

	} while (copied < n);

	return copied;
}

/*
 * Fetch one stack_frame from user address @fp into @frame.
 * Returns 1 when the whole structure was copied, 0 otherwise.
 */
static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	return copy_from_user_nmi(frame, fp, sizeof(*frame)) == sizeof(*frame);
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

2391 2392 2393
	if (!user_mode(regs))
		regs = task_pt_regs(current);

2394
	fp = (void __user *)regs->bp;
2395

2396
	callchain_store(entry, PERF_CONTEXT_USER);
2397 2398
	callchain_store(entry, regs->ip);

2399
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
2400
		frame.next_frame	     = NULL;
2401 2402 2403 2404 2405
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

2406
		if ((unsigned long)fp < regs->sp)
2407 2408 2409
			break;

		callchain_store(entry, frame.return_address);
2410
		fp = frame.next_frame;
2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441
	}
}

/*
 * Fill @entry for the context described by @regs.
 *
 * Nothing is recorded when @regs is NULL, when there is no current task or
 * it has pid 0, or when @regs is user mode and the task is not
 * TASK_RUNNING. The kernel chain is recorded only for non-user @regs; the
 * user chain is recorded whenever the current task has an mm.
 */
static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (!current || current->pid == 0)
		return;

	if (is_user) {
		if (current->state != TASK_RUNNING)
			return;
	} else {
		perf_callchain_kernel(regs, entry);
	}

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
2442
		entry = &__get_cpu_var(pmc_nmi_entry);
2443
	else
2444
		entry = &__get_cpu_var(pmc_irq_entry);
2445 2446 2447 2448 2449 2450 2451

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}
2452

2453
/* CPU-online hook: forwards @cpu to init_debug_store_on_cpu(). */
void hw_perf_event_setup_online(int cpu)
{
	init_debug_store_on_cpu(cpu);
}