#include <linux/perf_event.h>
#include <linux/types.h>

#include <asm/perf_event.h>
#include <asm/msr.h>
#include <asm/insn.h>

#include "../perf_event.h"

enum {
	LBR_FORMAT_32		= 0x00,
	LBR_FORMAT_LIP		= 0x01,
	LBR_FORMAT_EIP		= 0x02,
	LBR_FORMAT_EIP_FLAGS	= 0x03,
	LBR_FORMAT_EIP_FLAGS2	= 0x04,
	LBR_FORMAT_INFO		= 0x05,
	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
};

static enum {
	LBR_EIP_FLAGS		= 1,
	LBR_TSX			= 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
	[LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
	[LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
};

/*
 * Intel LBR_SELECT bits
 * Intel Vol3a, April 2011, Section 16.7 Table 16-10
 *
 * Hardware branch filter (not available on all CPUs)
 */
#define LBR_KERNEL_BIT		0 /* do not capture at ring0 */
#define LBR_USER_BIT		1 /* do not capture at ring > 0 */
#define LBR_JCC_BIT		2 /* do not capture conditional branches */
#define LBR_REL_CALL_BIT	3 /* do not capture relative calls */
#define LBR_IND_CALL_BIT	4 /* do not capture indirect calls */
#define LBR_RETURN_BIT		5 /* do not capture near returns */
#define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
#define LBR_FAR_BIT		8 /* do not capture far branches */
#define LBR_CALL_STACK_BIT	9 /* enable call stack */

/*
 * The following bit only exists in Linux; we mask it out before writing it to
 * the actual MSR. But it helps the perf constraint code to understand
 * that this is a separate configuration.
 */
#define LBR_NO_INFO_BIT	       63 /* don't read LBR_INFO. */

#define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
#define LBR_USER	(1 << LBR_USER_BIT)
#define LBR_JCC		(1 << LBR_JCC_BIT)
#define LBR_REL_CALL	(1 << LBR_REL_CALL_BIT)
#define LBR_IND_CALL	(1 << LBR_IND_CALL_BIT)
#define LBR_RETURN	(1 << LBR_RETURN_BIT)
#define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
#define LBR_FAR		(1 << LBR_FAR_BIT)
#define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)
#define LBR_NO_INFO	(1ULL << LBR_NO_INFO_BIT)

#define LBR_PLM (LBR_KERNEL | LBR_USER)

#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */
#define LBR_NOT_SUPP	-1	/* LBR filter not supported */
#define LBR_IGN		0	/* ignored */

#define LBR_ANY		 \
	(LBR_JCC	|\
	 LBR_REL_CALL	|\
	 LBR_IND_CALL	|\
	 LBR_RETURN	|\
	 LBR_REL_JMP	|\
	 LBR_IND_JMP	|\
	 LBR_FAR)

#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
#define LBR_FROM_FLAG_ABORT    (1ULL << 61)

/*
 * x86 control flow change classification
 * x86 control flow changes include branches, interrupts, traps, faults
 */
enum {
	X86_BR_NONE		= 0,      /* unknown */

	X86_BR_USER		= 1 << 0, /* branch target is user */
	X86_BR_KERNEL		= 1 << 1, /* branch target is kernel */

	X86_BR_CALL		= 1 << 2, /* call */
	X86_BR_RET		= 1 << 3, /* return */
	X86_BR_SYSCALL		= 1 << 4, /* syscall */
	X86_BR_SYSRET		= 1 << 5, /* syscall return */
	X86_BR_INT		= 1 << 6, /* sw interrupt */
	X86_BR_IRET		= 1 << 7, /* return from interrupt */
	X86_BR_JCC		= 1 << 8, /* conditional */
	X86_BR_JMP		= 1 << 9, /* jump */
	X86_BR_IRQ		= 1 << 10,/* hw interrupt or trap or fault */
	X86_BR_IND_CALL		= 1 << 11,/* indirect calls */
	X86_BR_ABORT		= 1 << 12,/* transaction abort */
	X86_BR_IN_TX		= 1 << 13,/* in transaction */
	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
};

#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)

#define X86_BR_ANY       \
	(X86_BR_CALL    |\
	 X86_BR_RET     |\
	 X86_BR_SYSCALL |\
	 X86_BR_SYSRET  |\
	 X86_BR_INT     |\
	 X86_BR_IRET    |\
	 X86_BR_JCC     |\
	 X86_BR_JMP	 |\
	 X86_BR_IRQ	 |\
	 X86_BR_ABORT	 |\
	 X86_BR_IND_CALL |\
	 X86_BR_IND_JMP  |\
	 X86_BR_ZERO_CALL)

#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)

#define X86_BR_ANY_CALL		 \
	(X86_BR_CALL		|\
	 X86_BR_IND_CALL	|\
	 X86_BR_ZERO_CALL	|\
	 X86_BR_SYSCALL		|\
	 X86_BR_IRQ		|\
	 X86_BR_INT)

static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);

/*
 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI,
 * otherwise it becomes nearly impossible to get a reliable stack.
 */

static void __intel_pmu_lbr_enable(bool pmi)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	u64 debugctl, lbr_select = 0, orig_debugctl;

	/*
	 * No need to unfreeze manually, as v4 can do that as part
	 * of the GLOBAL_STATUS ack.
	 */
	if (pmi && x86_pmu.version >= 4)
		return;

	/*
	 * No need to reprogram LBR_SELECT in a PMI, as it
	 * did not change.
	 */
	if (cpuc->lbr_sel)
		lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
	if (!pmi && cpuc->lbr_sel)
		wrmsrl(MSR_LBR_SELECT, lbr_select);

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	orig_debugctl = debugctl;
	debugctl |= DEBUGCTLMSR_LBR;
	/*
	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
	 * may cause superfluous increase/decrease of LBR_TOS.
	 */
	if (!(lbr_select & LBR_CALL_STACK))
		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
	if (orig_debugctl != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}

static void __intel_pmu_lbr_disable(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}

static void intel_pmu_lbr_reset_32(void)
{
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++)
		wrmsrl(x86_pmu.lbr_from + i, 0);
}

static void intel_pmu_lbr_reset_64(void)
{
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		wrmsrl(x86_pmu.lbr_from + i, 0);
		wrmsrl(x86_pmu.lbr_to   + i, 0);
		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
			wrmsrl(MSR_LBR_INFO_0 + i, 0);
	}
}

void intel_pmu_lbr_reset(void)
{
	if (!x86_pmu.lbr_nr)
		return;

	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
		intel_pmu_lbr_reset_32();
	else
		intel_pmu_lbr_reset_64();
}

/*
 * TOS = most recently recorded branch
 */
static inline u64 intel_pmu_lbr_tos(void)
{
	u64 tos;

	rdmsrl(x86_pmu.lbr_tos, tos);
	return tos;
}

enum {
	LBR_NONE,
	LBR_VALID,
};

static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{
	int i;
	unsigned lbr_idx, mask;
	u64 tos;

	if (task_ctx->lbr_callstack_users == 0 ||
	    task_ctx->lbr_stack_state == LBR_NONE) {
		intel_pmu_lbr_reset();
		return;
	}

	mask = x86_pmu.lbr_nr - 1;
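	/* lbr_nr is a power of two, so this mask wraps the LBR ring-buffer index */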
	tos = task_ctx->tos;
	for (i = 0; i < tos; i++) {
		lbr_idx = (tos - i) & mask;
		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
	}
	wrmsrl(x86_pmu.lbr_tos, tos);
	task_ctx->lbr_stack_state = LBR_NONE;
}

static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{
	int i;
	unsigned lbr_idx, mask;
	u64 tos;

	if (task_ctx->lbr_callstack_users == 0) {
		task_ctx->lbr_stack_state = LBR_NONE;
		return;
	}

	mask = x86_pmu.lbr_nr - 1;
	tos = intel_pmu_lbr_tos();
	for (i = 0; i < tos; i++) {
		lbr_idx = (tos - i) & mask;
		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
	}
	task_ctx->tos = tos;
	task_ctx->lbr_stack_state = LBR_VALID;
}

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	/*
	 * If LBR callstack feature is enabled and the stack was saved when
	 * the task was scheduled out, restore the stack. Otherwise flush
	 * the LBR stack.
	 */
	task_ctx = ctx ? ctx->task_ctx_data : NULL;
	if (task_ctx) {
		if (sched_in) {
			__intel_pmu_lbr_restore(task_ctx);
			cpuc->lbr_context = ctx;
		} else {
			__intel_pmu_lbr_save(task_ctx);
		}
		return;
	}

	/*
	 * When sampling the branch stack in system-wide mode, it may be
	 * necessary to flush the stack on context switch. This happens
	 * when the branch stack does not tag its entries with the pid
	 * of the current task. Otherwise it becomes impossible to
	 * associate a branch entry with a task. This ambiguity is more
	 * likely to appear when the branch stack supports priv level
	 * filtering and the user sets it to monitor only at the user
	 * level (which could be a useful measurement in system-wide
	 * mode). In that case, the risk is high of having a branch
	 * stack with branch from multiple tasks.
 	 */
	if (sched_in) {
		intel_pmu_lbr_reset();
		cpuc->lbr_context = ctx;
	}
}

static inline bool branch_user_callstack(unsigned br_sel)
{
	return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}

void intel_pmu_lbr_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	if (!x86_pmu.lbr_nr)
		return;

	/*
	 * Reset the LBR stack if we changed task context to
	 * avoid data leaks.
	 */
	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
		intel_pmu_lbr_reset();
		cpuc->lbr_context = event->ctx;
	}
	cpuc->br_sel = event->hw.branch_reg.reg;

	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
					event->ctx->task_ctx_data) {
		task_ctx = event->ctx->task_ctx_data;
		task_ctx->lbr_callstack_users++;
	}

	cpuc->lbr_users++;
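	/* make sure the PMU's sched_task() callback runs on context switches */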
	perf_sched_cb_inc(event->ctx->pmu);
}

void intel_pmu_lbr_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	if (!x86_pmu.lbr_nr)
		return;

	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
					event->ctx->task_ctx_data) {
		task_ctx = event->ctx->task_ctx_data;
		task_ctx->lbr_callstack_users--;
	}

	cpuc->lbr_users--;
	WARN_ON_ONCE(cpuc->lbr_users < 0);
	perf_sched_cb_dec(event->ctx->pmu);

	if (cpuc->enabled && !cpuc->lbr_users) {
		__intel_pmu_lbr_disable();
		/* avoid stale pointer */
		cpuc->lbr_context = NULL;
	}
}

void intel_pmu_lbr_enable_all(bool pmi)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (cpuc->lbr_users)
		__intel_pmu_lbr_enable(pmi);
}

void intel_pmu_lbr_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (cpuc->lbr_users)
		__intel_pmu_lbr_disable();
}

static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
	unsigned long mask = x86_pmu.lbr_nr - 1;
	u64 tos = intel_pmu_lbr_tos();
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		unsigned long lbr_idx = (tos - i) & mask;
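		/* the 32-bit format packs FROM (low half) and TO (high half) into one MSR */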
		union {
			struct {
				u32 from;
				u32 to;
			};
			u64     lbr;
		} msr_lastbranch;

		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);

		cpuc->lbr_entries[i].from	= msr_lastbranch.from;
		cpuc->lbr_entries[i].to		= msr_lastbranch.to;
		cpuc->lbr_entries[i].mispred	= 0;
		cpuc->lbr_entries[i].predicted	= 0;
		cpuc->lbr_entries[i].reserved	= 0;
	}
	cpuc->lbr_stack.nr = i;
}

/*
 * Due to lack of segmentation in Linux the effective address (offset)
 * is the same as the linear address, allowing us to merge the LIP and EIP
 * LBR formats.
 */
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
	bool need_info = false;
	unsigned long mask = x86_pmu.lbr_nr - 1;
	int lbr_format = x86_pmu.intel_cap.lbr_format;
	u64 tos = intel_pmu_lbr_tos();
	int i;
	int out = 0;
	int num = x86_pmu.lbr_nr;

	if (cpuc->lbr_sel) {
		need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
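		/* in call-stack mode the TOS acts as a stack pointer: only 'tos' entries are valid */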
		if (cpuc->lbr_sel->config & LBR_CALL_STACK)
			num = tos;
	}

	for (i = 0; i < num; i++) {
		unsigned long lbr_idx = (tos - i) & mask;
		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
		int skip = 0;
		u16 cycles = 0;
		int lbr_flags = lbr_desc[lbr_format];

		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);

		if (lbr_format == LBR_FORMAT_INFO && need_info) {
			u64 info;

			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
			mis = !!(info & LBR_INFO_MISPRED);
			pred = !mis;
			in_tx = !!(info & LBR_INFO_IN_TX);
			abort = !!(info & LBR_INFO_ABORT);
			cycles = (info & LBR_INFO_CYCLES);
		}
		if (lbr_flags & LBR_EIP_FLAGS) {
			mis = !!(from & LBR_FROM_FLAG_MISPRED);
			pred = !mis;
			skip = 1;
		}
		if (lbr_flags & LBR_TSX) {
			in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
			abort = !!(from & LBR_FROM_FLAG_ABORT);
			skip = 3;
		}
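		/*
		 * Shift out the flag bits (if any) and sign-extend the
		 * from address back to canonical form.
		 */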
		from = (u64)((((s64)from) << skip) >> skip);

		/*
		 * Some CPUs report duplicated abort records,
		 * with the second entry not having an abort bit set.
		 * Skip them here. This loop walks the entries from newest
		 * to oldest, so we need to undo the previously stored record.
		 * If the abort just happened outside the window
		 * the extra entry cannot be removed.
		 */
		if (abort && x86_pmu.lbr_double_abort && out > 0)
			out--;

		cpuc->lbr_entries[out].from	 = from;
		cpuc->lbr_entries[out].to	 = to;
		cpuc->lbr_entries[out].mispred	 = mis;
		cpuc->lbr_entries[out].predicted = pred;
		cpuc->lbr_entries[out].in_tx	 = in_tx;
		cpuc->lbr_entries[out].abort	 = abort;
		cpuc->lbr_entries[out].cycles	 = cycles;
		cpuc->lbr_entries[out].reserved	 = 0;
		out++;
	}
	cpuc->lbr_stack.nr = out;
}

void intel_pmu_lbr_read(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!cpuc->lbr_users)
		return;

	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
		intel_pmu_lbr_read_32(cpuc);
	else
		intel_pmu_lbr_read_64(cpuc);

	intel_pmu_lbr_filter(cpuc);
}

/*
 * SW filter is used:
 * - in case there is no HW filter
 * - in case the HW filter has errata or limitations
 */
static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
	u64 br_type = event->attr.branch_sample_type;
	int mask = 0;

	if (br_type & PERF_SAMPLE_BRANCH_USER)
		mask |= X86_BR_USER;

	if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
		mask |= X86_BR_KERNEL;

	/* we ignore BRANCH_HV here */

	if (br_type & PERF_SAMPLE_BRANCH_ANY)
		mask |= X86_BR_ANY;

	if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
		mask |= X86_BR_ANY_CALL;

	if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
		mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;

	if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
		mask |= X86_BR_IND_CALL;

	if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
		mask |= X86_BR_ABORT;

	if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
		mask |= X86_BR_IN_TX;

	if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
		mask |= X86_BR_NO_TX;

	if (br_type & PERF_SAMPLE_BRANCH_COND)
		mask |= X86_BR_JCC;

	if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
		if (!x86_pmu_has_lbr_callstack())
			return -EOPNOTSUPP;
		if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
			return -EINVAL;
		mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
			X86_BR_CALL_STACK;
	}

	if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
		mask |= X86_BR_IND_JMP;

	if (br_type & PERF_SAMPLE_BRANCH_CALL)
		mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
	/*
	 * stash the actual user request into reg; it may
	 * be used by fixup code for some CPUs
	 */
	event->hw.branch_reg.reg = mask;
	return 0;
}

/*
 * setup the HW LBR filter.
 * Used only when available; it may not be enough to disambiguate
 * all branches and may need the help of the SW filter.
 */
static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	u64 br_type = event->attr.branch_sample_type;
	u64 mask = 0, v;
	int i;

	for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
		if (!(br_type & (1ULL << i)))
			continue;

		v = x86_pmu.lbr_sel_map[i];
		if (v == LBR_NOT_SUPP)
			return -EOPNOTSUPP;

		if (v != LBR_IGN)
			mask |= v;
	}

	reg = &event->hw.branch_reg;
	reg->idx = EXTRA_REG_LBR;

	/*
	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
	 * in suppress mode. So LBR_SELECT should be set to
	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
	 */
	reg->config = mask ^ x86_pmu.lbr_sel_mask;

	if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
	    (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
	    (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
		reg->config |= LBR_NO_INFO;

	return 0;
}

int intel_pmu_setup_lbr_filter(struct perf_event *event)
{
	int ret = 0;

	/*
	 * no LBR on this PMU
	 */
	if (!x86_pmu.lbr_nr)
		return -EOPNOTSUPP;

	/*
	 * setup SW LBR filter
	 */
	ret = intel_pmu_setup_sw_lbr_filter(event);
	if (ret)
		return ret;

	/*
	 * setup HW LBR filter, if any
	 */
	if (x86_pmu.lbr_sel_map)
		ret = intel_pmu_setup_hw_lbr_filter(event);

	return ret;
}

/*
 * return the type of control flow change at address "from";
 * the instruction is not necessarily a branch (in case of interrupt).
 *
 * The branch type returned also includes the priv level of the
 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
 *
 * If a branch type is unknown OR the instruction cannot be
 * decoded (e.g., text page not present), then X86_BR_NONE is
 * returned.
 */
static int branch_type(unsigned long from, unsigned long to, int abort)
{
	struct insn insn;
	void *addr;
	int bytes_read, bytes_left;
	int ret = X86_BR_NONE;
	int ext, to_plm, from_plm;
	u8 buf[MAX_INSN_SIZE];
	int is64 = 0;

	to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
	from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;

	/*
	 * may be zero if the LBR did not fill up after a reset by the time
	 * we get a PMU interrupt
	 */
	if (from == 0 || to == 0)
		return X86_BR_NONE;

	if (abort)
		return X86_BR_ABORT | to_plm;

	if (from_plm == X86_BR_USER) {
		/*
		 * can happen if measuring at the user level only
		 * and we interrupt in a kernel thread, e.g., idle.
		 */
		if (!current->mm)
			return X86_BR_NONE;

		/* may fail if text not present */
		bytes_left = copy_from_user_nmi(buf, (void __user *)from,
						MAX_INSN_SIZE);
		bytes_read = MAX_INSN_SIZE - bytes_left;
		if (!bytes_read)
			return X86_BR_NONE;

		addr = buf;
	} else {
		/*
		 * The LBR logs any address in the IP, even if the IP just
		 * faulted. This means userspace can control the from address.
		 * Ensure we don't blindly read any address by validating it is
		 * a known text address.
		 */
		if (kernel_text_address(from)) {
			addr = (void *)from;
			/*
			 * Assume we can get the maximum possible size
			 * when grabbing kernel data.  This is not
			 * _strictly_ true since we could possibly be
			 * executing up next to a memory hole, but
			 * it is very unlikely to be a problem.
			 */
			bytes_read = MAX_INSN_SIZE;
		} else {
			return X86_BR_NONE;
		}
	}

	/*
	 * decoder needs to know the ABI especially
	 * on 64-bit systems running 32-bit apps
	 */
#ifdef CONFIG_X86_64
	is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
#endif
	insn_init(&insn, addr, bytes_read, is64);
	insn_get_opcode(&insn);
	if (!insn.opcode.got)
		return X86_BR_ABORT;

	switch (insn.opcode.bytes[0]) {
	case 0xf:
		switch (insn.opcode.bytes[1]) {
		case 0x05: /* syscall */
		case 0x34: /* sysenter */
			ret = X86_BR_SYSCALL;
			break;
		case 0x07: /* sysret */
		case 0x35: /* sysexit */
			ret = X86_BR_SYSRET;
			break;
		case 0x80 ... 0x8f: /* conditional */
			ret = X86_BR_JCC;
			break;
		default:
			ret = X86_BR_NONE;
		}
		break;
	case 0x70 ... 0x7f: /* conditional */
		ret = X86_BR_JCC;
		break;
	case 0xc2: /* near ret */
	case 0xc3: /* near ret */
	case 0xca: /* far ret */
	case 0xcb: /* far ret */
		ret = X86_BR_RET;
		break;
	case 0xcf: /* iret */
		ret = X86_BR_IRET;
		break;
	case 0xcc ... 0xce: /* int */
		ret = X86_BR_INT;
		break;
	case 0xe8: /* call near rel */
		insn_get_immediate(&insn);
		if (insn.immediate1.value == 0) {
			/* zero length call */
			ret = X86_BR_ZERO_CALL;
			break;
		}
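		/* fall through: a non-zero displacement is a regular near call */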
	case 0x9a: /* call far absolute */
		ret = X86_BR_CALL;
		break;
	case 0xe0 ... 0xe3: /* loop jmp */
		ret = X86_BR_JCC;
		break;
	case 0xe9 ... 0xeb: /* jmp */
		ret = X86_BR_JMP;
		break;
	case 0xff: /* call near absolute, call far absolute ind */
		insn_get_modrm(&insn);
		ext = (insn.modrm.bytes[0] >> 3) & 0x7;
		switch (ext) {
		case 2: /* near ind call */
		case 3: /* far ind call */
			ret = X86_BR_IND_CALL;
			break;
		case 4:
		case 5:
			ret = X86_BR_IND_JMP;
			break;
		}
		break;
	default:
		ret = X86_BR_NONE;
	}
	/*
	 * interrupts, traps, faults (and thus ring transitions) may
	 * occur on any instruction. Thus, to classify them correctly,
	 * we need to first look at the from and to priv levels. If they
	 * are different and to is in the kernel, then it indicates
	 * a ring transition. If the from instruction is not a ring
	 * transition instr (syscall, sysenter, int), then it means
	 * it was an irq, trap or fault.
	 *
	 * we have no way of detecting kernel to kernel faults.
	 */
	if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
	    && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
		ret = X86_BR_IRQ;

	/*
	 * branch priv level determined by target as
	 * is done by HW when LBR_SELECT is implemented
	 */
	if (ret != X86_BR_NONE)
		ret |= to_plm;

	return ret;
}

/*
 * implement actual branch filter based on user demand.
 * Hardware may not exactly satisfy that request, thus
 * we need to inspect opcodes. Mismatched branches are
 * discarded. Therefore, the number of branches returned
 * in a PERF_SAMPLE_BRANCH_STACK sample may vary.
 */
static void
intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
{
	u64 from, to;
	int br_sel = cpuc->br_sel;
	int i, j, type;
	bool compress = false;

	/* if sampling all branches, then nothing to filter */
	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
		return;

	for (i = 0; i < cpuc->lbr_stack.nr; i++) {

		from = cpuc->lbr_entries[i].from;
		to = cpuc->lbr_entries[i].to;

		type = branch_type(from, to, cpuc->lbr_entries[i].abort);
		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
			if (cpuc->lbr_entries[i].in_tx)
				type |= X86_BR_IN_TX;
			else
				type |= X86_BR_NO_TX;
		}

		/* if type does not correspond, then discard */
		if (type == X86_BR_NONE || (br_sel & type) != type) {
			cpuc->lbr_entries[i].from = 0;
			compress = true;
		}
	}

	if (!compress)
		return;

	/* remove all entries with from=0 */
	for (i = 0; i < cpuc->lbr_stack.nr; ) {
		if (!cpuc->lbr_entries[i].from) {
			j = i;
			while (++j < cpuc->lbr_stack.nr)
				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
			cpuc->lbr_stack.nr--;
			if (!cpuc->lbr_entries[i].from)
				continue;
		}
		i++;
	}
}

/*
 * Map interface branch filters onto LBR filters
 */
static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_REL_JMP
						| LBR_IND_JMP | LBR_FAR,
	/*
	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
	 */
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
	/*
	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
	 */
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
};

static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_FAR,
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]	= LBR_IND_JMP,
	[PERF_SAMPLE_BRANCH_CALL_SHIFT]		= LBR_REL_CALL,
};

static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_FAR,
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_RETURN | LBR_CALL_STACK,
	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]	= LBR_IND_JMP,
	[PERF_SAMPLE_BRANCH_CALL_SHIFT]		= LBR_REL_CALL,
};

/* core */
void __init intel_pmu_lbr_init_core(void)
{
	x86_pmu.lbr_nr     = 4;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;

	/*
	 * SW branch filter usage:
	 * - compensate for lack of HW filter
	 */
	pr_cont("4-deep LBR, ");
}

/* nehalem/westmere */
void __init intel_pmu_lbr_init_nhm(void)
{
	x86_pmu.lbr_nr     = 16;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;

	/*
	 * SW branch filter usage:
	 * - workaround LBR_SEL errata (see above)
	 * - support syscall, sysret capture.
	 *   That requires LBR_FAR but that means far
	 *   jmps need to be filtered out
	 */
	pr_cont("16-deep LBR, ");
}

/* sandy bridge */
void __init intel_pmu_lbr_init_snb(void)
{
	x86_pmu.lbr_nr	 = 16;
	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;

	/*
	 * SW branch filter usage:
	 * - support syscall, sysret capture.
	 *   That requires LBR_FAR but that means far
	 *   jmps need to be filtered out
	 */
	pr_cont("16-deep LBR, ");
}

/* haswell */
void intel_pmu_lbr_init_hsw(void)
{
	x86_pmu.lbr_nr	 = 16;
	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;

	pr_cont("16-deep LBR, ");
}

/* skylake */
__init void intel_pmu_lbr_init_skl(void)
{
	x86_pmu.lbr_nr	 = 32;
	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;

	/*
	 * SW branch filter usage:
	 * - support syscall, sysret capture.
	 *   That requires LBR_FAR but that means far
	 *   jmps need to be filtered out
	 */
	pr_cont("32-deep LBR, ");
}

/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
	/*
	 * only models starting at stepping 10 seem
	 * to have an operational LBR which can freeze
	 * on PMU interrupt
	 */
	if (boot_cpu_data.x86_model == 28
	    && boot_cpu_data.x86_mask < 10) {
		pr_cont("LBR disabled due to erratum");
		return;
	}

	x86_pmu.lbr_nr	   = 8;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;

	/*
	 * SW branch filter usage:
	 * - compensate for lack of HW filter
	 */
	pr_cont("8-deep LBR, ");
}

/* Knights Landing */
void intel_pmu_lbr_init_knl(void)
{
	x86_pmu.lbr_nr	   = 8;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;

	pr_cont("8-deep LBR, ");
}