core-book3s.c
/*
 * Performance event support - powerpc architecture code
 *
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>
#include <asm/code-patching.h>

#define BHRB_MAX_ENTRIES	32
#define BHRB_TARGET		0x0000000000000002
#define BHRB_PREDICTION		0x0000000000000001
#define BHRB_EA			0xFFFFFFFFFFFFFFFC

struct cpu_hw_events {
	int n_events;
	int n_percpu;
	int disabled;
	int n_added;
	int n_limited;
	u8  pmcs_enabled;
	struct perf_event *event[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int flags[MAX_HWEVENTS];
	unsigned long mmcr[3];
	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];

	unsigned int group_flag;
	int n_txn_start;

	/* BHRB bits */
	u64				bhrb_filter;	/* BHRB HW branch filter */
	int				bhrb_users;
	void				*bhrb_context;
	struct	perf_branch_stack	bhrb_stack;
	struct	perf_branch_entry	bhrb_entries[BHRB_MAX_ENTRIES];
};

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

struct power_pmu *ppmu;

/*
 * Normally, to ignore kernel events we set the FCS (freeze counters
 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 * hypervisor bit set in the MSR, or if we are running on a processor
 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 * then we need to use the FCHV bit to ignore kernel events.
 */
static unsigned int freeze_events_kernel = MMCR0_FCS;

/*
 * 32-bit doesn't have MMCRA but does have an MMCR2,
 * and a few other names are different.
 */
#ifdef CONFIG_PPC32

#define MMCR0_FCHV		0
#define MMCR0_PMCjCE		MMCR0_PMCnCE

#define SPRN_MMCRA		SPRN_MMCR2
#define MMCRA_SAMPLE_ENABLE	0

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_read_regs(struct pt_regs *regs)
{
	regs->result = 0;
}
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return 0;
}

static inline int siar_valid(struct pt_regs *regs)
{
	return 1;
}

static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
void power_pmu_flush_branch_stack(void) {}
static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
#endif /* CONFIG_PPC32 */

static bool regs_use_siar(struct pt_regs *regs)
{
	return !!regs->result;
}

/*
 * Things that are specific to 64-bit implementations.
 */
#ifdef CONFIG_PPC64

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;

	if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) {
		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
		if (slot > 1)
			return 4 * (slot - 1);
	}

	return 0;
}
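/*
 * Illustrative example (not from the original source): with sampling
 * enabled, SIAR points at the first instruction of the sampled group and
 * the MMCRA[SLOT] field identifies which instruction in that group was
 * sampled.  For slot == 3 the adjustment above is 4 * (3 - 1) = 8, so the
 * reported instruction pointer becomes SIAR + 8.
 */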

/*
 * The user wants a data address recorded.
 * If we're not doing instruction sampling, give them the SDAR
 * (sampled data address).  If we are doing instruction sampling, then
 * only give them the SDAR if it corresponds to the instruction
 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC or
 * the [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA.
 */
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
{
	unsigned long mmcra = regs->dsisr;
	unsigned long sdsync;

	if (ppmu->flags & PPMU_SIAR_VALID)
		sdsync = POWER7P_MMCRA_SDAR_VALID;
	else if (ppmu->flags & PPMU_ALT_SIPR)
		sdsync = POWER6_MMCRA_SDSYNC;
	else
		sdsync = MMCRA_SDSYNC;

	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
		*addrp = mfspr(SPRN_SDAR);
}

static bool regs_sihv(struct pt_regs *regs)
{
	unsigned long sihv = MMCRA_SIHV;

	if (ppmu->flags & PPMU_HAS_SIER)
		return !!(regs->dar & SIER_SIHV);

	if (ppmu->flags & PPMU_ALT_SIPR)
		sihv = POWER6_MMCRA_SIHV;

	return !!(regs->dsisr & sihv);
}

static bool regs_sipr(struct pt_regs *regs)
{
	unsigned long sipr = MMCRA_SIPR;

	if (ppmu->flags & PPMU_HAS_SIER)
		return !!(regs->dar & SIER_SIPR);

	if (ppmu->flags & PPMU_ALT_SIPR)
		sipr = POWER6_MMCRA_SIPR;

	return !!(regs->dsisr & sipr);
}

static inline u32 perf_flags_from_msr(struct pt_regs *regs)
{
	if (regs->msr & MSR_PR)
		return PERF_RECORD_MISC_USER;
	if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
		return PERF_RECORD_MISC_HYPERVISOR;
	return PERF_RECORD_MISC_KERNEL;
}

static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (!use_siar)
		return perf_flags_from_msr(regs);

	/*
	 * If we don't have flags in MMCRA, rather than using
	 * the MSR, we intuit the flags from the address in
	 * SIAR which should give slightly more reliable
	 * results
	 */
	if (ppmu->flags & PPMU_NO_SIPR) {
		unsigned long siar = mfspr(SPRN_SIAR);
		if (siar >= PAGE_OFFSET)
			return PERF_RECORD_MISC_KERNEL;
		return PERF_RECORD_MISC_USER;
	}

	/* PR has priority over HV, so order below is important */
	if (regs_sipr(regs))
		return PERF_RECORD_MISC_USER;

	if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV))
		return PERF_RECORD_MISC_HYPERVISOR;

	return PERF_RECORD_MISC_KERNEL;
}

/*
 * Overload regs->dsisr to store MMCRA so we only need to read it once
 * on each interrupt.
 * Overload regs->dar to store SIER if we have it.
 * Overload regs->result to specify whether we should use the MSR (result
 * is zero) or the SIAR (result is non zero).
 */
static inline void perf_read_regs(struct pt_regs *regs)
{
	unsigned long mmcra = mfspr(SPRN_MMCRA);
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
	int use_siar;

	regs->dsisr = mmcra;

	if (ppmu->flags & PPMU_HAS_SIER)
		regs->dar = mfspr(SPRN_SIER);

	/*
	 * If this isn't a PMU exception (eg a software event) the SIAR is
	 * not valid. Use pt_regs.
	 *
	 * If it is a marked event use the SIAR.
	 *
	 * If the PMU doesn't update the SIAR for non marked events use
	 * pt_regs.
	 *
	 * If the PMU has HV/PR flags then check to see if they
	 * place the exception in userspace. If so, use pt_regs. In
	 * continuous sampling mode the SIAR and the PMU exception are
	 * not synchronised, so they may be many instructions apart.
	 * This can result in confusing backtraces. We still want
	 * hypervisor samples as well as samples in the kernel with
	 * interrupts off hence the userspace check.
	 */
	if (TRAP(regs) != 0xf00)
		use_siar = 0;
	else if (marked)
		use_siar = 1;
	else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
		use_siar = 0;
	else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs))
		use_siar = 0;
	else
		use_siar = 1;

	regs->result = use_siar;
}

/*
 * If interrupts were soft-disabled when a PMU interrupt occurs, treat
 * it as an NMI.
 */
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return !regs->softe;
}

/*
 * On processors like P7+ that have the SIAR-Valid bit, marked instructions
 * must be sampled only if the SIAR-valid bit is set.
 *
 * For unmarked instructions and for processors that don't have the SIAR-Valid
 * bit, assume that SIAR is valid.
 */
static inline int siar_valid(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;

	if ((ppmu->flags & PPMU_SIAR_VALID) && marked)
		return mmcra & POWER7P_MMCRA_SIAR_VALID;

	return 1;
}


/* Reset all possible BHRB entries */
static void power_pmu_bhrb_reset(void)
{
	asm volatile(PPC_CLRBHRB);
}

static void power_pmu_bhrb_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!ppmu->bhrb_nr)
		return;

	/* Clear BHRB if we changed task context to avoid data leaks */
	if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
		power_pmu_bhrb_reset();
		cpuhw->bhrb_context = event->ctx;
	}
	cpuhw->bhrb_users++;
}

static void power_pmu_bhrb_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!ppmu->bhrb_nr)
		return;

	cpuhw->bhrb_users--;
	WARN_ON_ONCE(cpuhw->bhrb_users < 0);

	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
		/* BHRB cannot be turned off when other
		 * events are active on the PMU.
		 */

		/* avoid stale pointer */
		cpuhw->bhrb_context = NULL;
	}
}

/* Called from ctxsw to prevent one process's branch entries from
 * mingling with the other process's entries during context switch.
 */
void power_pmu_flush_branch_stack(void)
{
	if (ppmu->bhrb_nr)
		power_pmu_bhrb_reset();
}
/* Calculate the to address for a branch */
static __u64 power_pmu_bhrb_to(u64 addr)
{
	unsigned int instr;
	int ret;
	__u64 target;

	if (is_kernel_addr(addr))
		return branch_target((unsigned int *)addr);

	/* Userspace: need copy instruction here then translate it */
	pagefault_disable();
	ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
	if (ret) {
		pagefault_enable();
		return 0;
	}
	pagefault_enable();

	target = branch_target(&instr);
	if ((!target) || (instr & BRANCH_ABSOLUTE))
		return target;

	/* Translate relative branch target from kernel to user address */
	return target - (unsigned long)&instr + addr;
}
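/*
 * Worked example (hypothetical user address): for a b-form branch at
 * addr 0x10001000 with a +0x20 displacement, branch_target(&instr) returns
 * the target relative to the kernel copy of the instruction, i.e.
 * (unsigned long)&instr + 0x20; the rebasing above turns that into the
 * user address 0x10001020.
 */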

/* Processing BHRB entries */
void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
{
	u64 val;
	u64 addr;
	int r_index, u_index, pred;

	r_index = 0;
	u_index = 0;
	while (r_index < ppmu->bhrb_nr) {
		/* Assembly read function */
		val = read_bhrb(r_index++);
		if (!val)
			/* Terminal marker: End of valid BHRB entries */
			break;
		else {
			addr = val & BHRB_EA;
			pred = val & BHRB_PREDICTION;

			if (!addr)
				/* invalid entry */
				continue;

			/* Branches are read most recent first (ie. mfbhrb 0 is
			 * the most recent branch).
			 * There are two types of valid entries:
			 * 1) a target entry which is the to address of a
			 *    computed goto like a blr,bctr,btar.  The next
			 *    entry read from the bhrb will be branch
			 *    corresponding to this target (ie. the actual
			 *    blr/bctr/btar instruction).
			 * 2) a from address which is an actual branch.  If a
			 *    target entry precedes this, then this is the
			 *    matching branch for that target.  If this is not
			 *    following a target entry, then this is a branch
			 *    where the target is given as an immediate field
			 *    in the instruction (ie. an i or b form branch).
			 *    In this case we need to read the instruction from
			 *    memory to determine the target/to address.
			 */

			if (val & BHRB_TARGET) {
				/* Target branches use two entries
				 * (ie. computed gotos/XL form)
				 */
				cpuhw->bhrb_entries[u_index].to = addr;
				cpuhw->bhrb_entries[u_index].mispred = pred;
				cpuhw->bhrb_entries[u_index].predicted = ~pred;

				/* Get from address in next entry */
				val = read_bhrb(r_index++);
				addr = val & BHRB_EA;
				if (val & BHRB_TARGET) {
					/* Shouldn't have two targets in a
					   row. Reset index and try again */
					r_index--;
					addr = 0;
				}
				cpuhw->bhrb_entries[u_index].from = addr;
			} else {
				/* Branches to immediate field 
				   (ie I or B form) */
				cpuhw->bhrb_entries[u_index].from = addr;
				cpuhw->bhrb_entries[u_index].to =
					power_pmu_bhrb_to(addr);
				cpuhw->bhrb_entries[u_index].mispred = pred;
				cpuhw->bhrb_entries[u_index].predicted = ~pred;
			}
			u_index++;

		}
	}
	cpuhw->bhrb_stack.nr = u_index;
	return;
}
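/*
 * Illustrative decode (made-up values): raw entries
 *	0xc000000000010100 | BHRB_TARGET, then 0xc000000000010200
 * become one perf_branch_entry with to = 0xc000000000010100 and
 * from = 0xc000000000010200 (an XL-form branch such as blr/bctr).  A lone
 * from-only entry is an i- or b-form branch, and its 'to' address is
 * recovered by power_pmu_bhrb_to() above.
 */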

#endif /* CONFIG_PPC64 */

static void perf_event_interrupt(struct pt_regs *regs);

void perf_event_print_debug(void)
{
}

/*
 * Read one performance monitor counter (PMC).
 */
static unsigned long read_pmc(int idx)
{
	unsigned long val;

	switch (idx) {
	case 1:
		val = mfspr(SPRN_PMC1);
		break;
	case 2:
		val = mfspr(SPRN_PMC2);
		break;
	case 3:
		val = mfspr(SPRN_PMC3);
		break;
	case 4:
		val = mfspr(SPRN_PMC4);
		break;
	case 5:
		val = mfspr(SPRN_PMC5);
		break;
	case 6:
		val = mfspr(SPRN_PMC6);
		break;
#ifdef CONFIG_PPC64
	case 7:
		val = mfspr(SPRN_PMC7);
		break;
	case 8:
		val = mfspr(SPRN_PMC8);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
		val = 0;
	}
	return val;
}

/*
 * Write one PMC.
 */
static void write_pmc(int idx, unsigned long val)
{
	switch (idx) {
	case 1:
		mtspr(SPRN_PMC1, val);
		break;
	case 2:
		mtspr(SPRN_PMC2, val);
		break;
	case 3:
		mtspr(SPRN_PMC3, val);
		break;
	case 4:
		mtspr(SPRN_PMC4, val);
		break;
	case 5:
		mtspr(SPRN_PMC5, val);
		break;
	case 6:
		mtspr(SPRN_PMC6, val);
		break;
#ifdef CONFIG_PPC64
	case 7:
		mtspr(SPRN_PMC7, val);
		break;
	case 8:
		mtspr(SPRN_PMC8, val);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	}
}

/*
 * Check if a set of events can all go on the PMU at once.
 * If they can't, this will look at alternative codes for the events
 * and see if any combination of alternative codes is feasible.
 * The feasible set is returned in event_id[].
 */
static int power_check_constraints(struct cpu_hw_events *cpuhw,
				   u64 event_id[], unsigned int cflags[],
				   int n_ev)
{
	unsigned long mask, value, nv;
	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
	int i, j;
	unsigned long addf = ppmu->add_fields;
	unsigned long tadd = ppmu->test_adder;

	if (n_ev > ppmu->n_counter)
		return -1;

	/* First see if the events will go on as-is */
	for (i = 0; i < n_ev; ++i) {
		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
		    && !ppmu->limited_pmc_event(event_id[i])) {
			ppmu->get_alternatives(event_id[i], cflags[i],
					       cpuhw->alternatives[i]);
			event_id[i] = cpuhw->alternatives[i][0];
		}
		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
					 &cpuhw->avalues[i][0]))
			return -1;
	}
	value = mask = 0;
	for (i = 0; i < n_ev; ++i) {
		nv = (value | cpuhw->avalues[i][0]) +
			(value & cpuhw->avalues[i][0] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
		     cpuhw->amasks[i][0]) != 0)
			break;
		value = nv;
		mask |= cpuhw->amasks[i][0];
	}
	if (i == n_ev)
		return 0;	/* all OK */

	/* doesn't work, gather alternatives... */
	if (!ppmu->get_alternatives)
		return -1;
	for (i = 0; i < n_ev; ++i) {
		choice[i] = 0;
		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
						  cpuhw->alternatives[i]);
		for (j = 1; j < n_alt[i]; ++j)
			ppmu->get_constraint(cpuhw->alternatives[i][j],
					     &cpuhw->amasks[i][j],
					     &cpuhw->avalues[i][j]);
	}

	/* enumerate all possibilities and see if any will work */
	i = 0;
	j = -1;
	value = mask = nv = 0;
	while (i < n_ev) {
		if (j >= 0) {
			/* we're backtracking, restore context */
			value = svalues[i];
			mask = smasks[i];
			j = choice[i];
		}
		/*
		 * See if any alternative k for event_id i,
		 * where k > j, will satisfy the constraints.
		 */
		while (++j < n_alt[i]) {
			nv = (value | cpuhw->avalues[i][j]) +
				(value & cpuhw->avalues[i][j] & addf);
			if ((((nv + tadd) ^ value) & mask) == 0 &&
			    (((nv + tadd) ^ cpuhw->avalues[i][j])
			     & cpuhw->amasks[i][j]) == 0)
				break;
		}
		if (j >= n_alt[i]) {
			/*
			 * No feasible alternative, backtrack
			 * to event_id i-1 and continue enumerating its
			 * alternatives from where we got up to.
			 */
			if (--i < 0)
				return -1;
		} else {
			/*
			 * Found a feasible alternative for event_id i,
			 * remember where we got up to with this event_id,
			 * go on to the next event_id, and start with
			 * the first alternative for it.
			 */
			choice[i] = j;
			svalues[i] = value;
			smasks[i] = mask;
			value = nv;
			mask |= cpuhw->amasks[i][j];
			++i;
			j = -1;
		}
	}

	/* OK, we have a feasible combination, tell the caller the solution */
	for (i = 0; i < n_ev; ++i)
		event_id[i] = cpuhw->alternatives[i][choice[i]];
	return 0;
}
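/*
 * Illustrative note (not part of the original source): each event's
 * constraint is a (mask, value) pair whose small bit-fields count how many
 * events of a given class are in use.  Adding values via ppmu->add_fields
 * accumulates those counts, and ppmu->test_adder is chosen so that
 * exceeding a field's limit carries into bits covered by the mask, which
 * the checks above detect.
 */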

/*
 * Check if newly-added events have consistent settings for
 * exclude_{user,kernel,hv} with each other and any previously
 * added events.
 */
static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
			  int n_prev, int n_new)
{
	int eu = 0, ek = 0, eh = 0;
	int i, n, first;
	struct perf_event *event;

	n = n_prev + n_new;
	if (n <= 1)
		return 0;

	first = 1;
	for (i = 0; i < n; ++i) {
		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
			continue;
		}
		event = ctrs[i];
		if (first) {
			eu = event->attr.exclude_user;
			ek = event->attr.exclude_kernel;
			eh = event->attr.exclude_hv;
			first = 0;
		} else if (event->attr.exclude_user != eu ||
			   event->attr.exclude_kernel != ek ||
			   event->attr.exclude_hv != eh) {
			return -EAGAIN;
		}
	}

	if (eu || ek || eh)
		for (i = 0; i < n; ++i)
			if (cflags[i] & PPMU_LIMITED_PMC_OK)
				cflags[i] |= PPMU_LIMITED_PMC_REQD;

	return 0;
}

static u64 check_and_compute_delta(u64 prev, u64 val)
{
	u64 delta = (val - prev) & 0xfffffffful;

	/*
	 * POWER7 can roll back counter values, if the new value is smaller
	 * than the previous value it will cause the delta and the counter to
	 * have bogus values unless we rolled a counter over.  If a counter is
	 * rolled back, it will be smaller, but within 256, which is the maximum
	 * number of events to roll back at once.  If we detect a rollback
	 * return 0.  This can lead to a small lack of precision in the
	 * counters.
	 */
	if (prev > val && (prev - val) < 256)
		delta = 0;

	return delta;
}
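/*
 * Worked example (hypothetical values): prev = 0x80000100, val = 0x80000080.
 * The PMC rolled back by 0x80 (< 256), so this returns 0 instead of the
 * bogus wrapped delta of 0xffffff80 that (val - prev) & 0xfffffffful would
 * otherwise produce.
 */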

static void power_pmu_read(struct perf_event *event)
{
	s64 val, delta, prev;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	if (!event->hw.idx)
		return;
	/*
	 * Performance monitor interrupts come even when interrupts
	 * are soft-disabled, as long as interrupts are hard-enabled.
	 * Therefore we treat them like NMIs.
	 */
	do {
		prev = local64_read(&event->hw.prev_count);
		barrier();
		val = read_pmc(event->hw.idx);
		delta = check_and_compute_delta(prev, val);
		if (!delta)
			return;
	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);

	local64_add(delta, &event->count);
	local64_sub(delta, &event->hw.period_left);
}

/*
 * On some machines, PMC5 and PMC6 can't be written, don't respect
 * the freeze conditions, and don't generate interrupts.  This tells
 * us if `event' is using such a PMC.
 */
static int is_limited_pmc(int pmcnum)
{
	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
		&& (pmcnum == 5 || pmcnum == 6);
}

static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
				    unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev, delta;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		if (!event->hw.idx)
			continue;
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		event->hw.idx = 0;
		delta = check_and_compute_delta(prev, val);
		if (delta)
			local64_add(delta, &event->count);
	}
}

static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
				  unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		event->hw.idx = cpuhw->limited_hwidx[i];
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		if (check_and_compute_delta(prev, val))
			local64_set(&event->hw.prev_count, val);
		perf_event_update_userpage(event);
	}
}

/*
 * Since limited events don't respect the freeze conditions, we
 * have to read them immediately after freezing or unfreezing the
 * other events.  We try to keep the values from the limited
 * events as consistent as possible by keeping the delay (in
 * cycles and instructions) between freezing/unfreezing and reading
 * the limited events as small and consistent as possible.
 * Therefore, if any limited events are in use, we read them
 * both, and always in the same order, to minimize variability,
 * and do it inside the same asm that writes MMCR0.
 */
static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
{
	unsigned long pmc5, pmc6;

	if (!cpuhw->n_limited) {
		mtspr(SPRN_MMCR0, mmcr0);
		return;
	}

	/*
	 * Write MMCR0, then read PMC5 and PMC6 immediately.
	 * To ensure we don't get a performance monitor interrupt
	 * between writing MMCR0 and freezing/thawing the limited
	 * events, we first write MMCR0 with the event overflow
	 * interrupt enable bits turned off.
	 */
	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
		     : "=&r" (pmc5), "=&r" (pmc6)
		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
		       "i" (SPRN_MMCR0),
		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));

	if (mmcr0 & MMCR0_FC)
		freeze_limited_counters(cpuhw, pmc5, pmc6);
	else
		thaw_limited_counters(cpuhw, pmc5, pmc6);

	/*
	 * Write the full MMCR0 including the event overflow interrupt
	 * enable bits, if necessary.
	 */
	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
		mtspr(SPRN_MMCR0, mmcr0);
}

/*
 * Disable all events to prevent PMU interrupts and to allow
 * events to be added or removed.
 */
static void power_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!cpuhw->disabled) {
		cpuhw->disabled = 1;
		cpuhw->n_added = 0;

		/*
		 * Check if we ever enabled the PMU on this cpu.
		 */
		if (!cpuhw->pmcs_enabled) {
			ppc_enable_pmcs();
			cpuhw->pmcs_enabled = 1;
		}

		/*
		 * Disable instruction sampling if it was enabled
		 */
		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
			mtspr(SPRN_MMCRA,
			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
			mb();
		}

		/*
		 * Set the 'freeze counters' bit.
		 * The barrier is to make sure the mtspr has been
		 * executed and the PMU has frozen the events
		 * before we return.
		 */
		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
		mb();
	}
	local_irq_restore(flags);
}

/*
 * Re-enable all events if disable == 0.
 * If we were previously disabled and events were added, then
 * put the new config on the PMU.
 */
static void power_pmu_enable(struct pmu *pmu)
{
	struct perf_event *event;
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	long i;
	unsigned long val;
	s64 left;
	unsigned int hwc_index[MAX_HWEVENTS];
	int n_lim;
	int idx;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);
	if (!cpuhw->disabled) {
		local_irq_restore(flags);
		return;
	}
	cpuhw->disabled = 0;

	/*
	 * If we didn't change anything, or only removed events,
	 * no need to recalculate MMCR* settings and reset the PMCs.
	 * Just reenable the PMU with the current MMCR* settings
	 * (possibly updated for removal of events).
	 */
	if (!cpuhw->n_added) {
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
		if (cpuhw->n_events == 0)
			ppc_set_pmu_inuse(0);
		goto out_enable;
	}

	/*
	 * Compute MMCR* values for the new set of events
	 */
	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
			       cpuhw->mmcr)) {
		/* shouldn't ever get here */
		printk(KERN_ERR "oops compute_mmcr failed\n");
		goto out;
	}

	/*
	 * Add in MMCR0 freeze bits corresponding to the
	 * attr.exclude_* bits for the first event.
	 * We have already checked that all events have the
	 * same values for these bits as the first event.
	 */
	event = cpuhw->event[0];
	if (event->attr.exclude_user)
		cpuhw->mmcr[0] |= MMCR0_FCP;
	if (event->attr.exclude_kernel)
		cpuhw->mmcr[0] |= freeze_events_kernel;
	if (event->attr.exclude_hv)
		cpuhw->mmcr[0] |= MMCR0_FCHV;

	/*
	 * Write the new configuration to MMCR* with the freeze
	 * bit set and set the hardware events to their initial values.
	 * Then unfreeze the events.
	 */
	ppc_set_pmu_inuse(1);
	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
				| MMCR0_FC);

	/*
	 * Read off any pre-existing events that need to move
	 * to another PMC.
	 */
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
			power_pmu_read(event);
			write_pmc(event->hw.idx, 0);
			event->hw.idx = 0;
		}
	}

	/*
	 * Initialize the PMCs for all the new and moved events.
	 */
	cpuhw->n_limited = n_lim = 0;
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx)
			continue;
		idx = hwc_index[i] + 1;
		if (is_limited_pmc(idx)) {
			cpuhw->limited_counter[n_lim] = event;
			cpuhw->limited_hwidx[n_lim] = idx;
			++n_lim;
			continue;
		}
		val = 0;
		if (event->hw.sample_period) {
			left = local64_read(&event->hw.period_left);
			if (left < 0x80000000L)
				val = 0x80000000L - left;
		}
		local64_set(&event->hw.prev_count, val);
		event->hw.idx = idx;
		if (event->hw.state & PERF_HES_STOPPED)
			val = 0;
		write_pmc(idx, val);
		perf_event_update_userpage(event);
	}
	cpuhw->n_limited = n_lim;
	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;

 out_enable:
	mb();
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	/*
	 * Enable instruction sampling if necessary
	 */
	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
		mb();
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	}

 out:
	if (cpuhw->bhrb_users)
		ppmu->config_bhrb(cpuhw->bhrb_filter);

	local_irq_restore(flags);
}

static int collect_events(struct perf_event *group, int max_count,
			  struct perf_event *ctrs[], u64 *events,
			  unsigned int *flags)
{
	int n = 0;
	struct perf_event *event;

	if (!is_software_event(group)) {
		if (n >= max_count)
			return -1;
		ctrs[n] = group;
		flags[n] = group->hw.event_base;
		events[n++] = group->hw.config;
	}
	list_for_each_entry(event, &group->sibling_list, group_entry) {
		if (!is_software_event(event) &&
		    event->state != PERF_EVENT_STATE_OFF) {
			if (n >= max_count)
				return -1;
			ctrs[n] = event;
			flags[n] = event->hw.event_base;
			events[n++] = event->hw.config;
		}
	}
	return n;
}

/*
 * Add an event to the PMU.
 * If all events are not already frozen, then we disable and
 * re-enable the PMU in order to get hw_perf_enable to do the
 * actual work of reconfiguring the PMU.
 */
static int power_pmu_add(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	int n0;
	int ret = -EAGAIN;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	/*
	 * Add the event to the list (if there is room)
	 * and check whether the total set is still feasible.
	 */
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n0 = cpuhw->n_events;
	if (n0 >= ppmu->n_counter)
		goto out;
	cpuhw->event[n0] = event;
	cpuhw->events[n0] = event->hw.config;
	cpuhw->flags[n0] = event->hw.event_base;

	/*
	 * This event may have been disabled/stopped in record_and_restart()
	 * because we exceeded the ->event_limit. If re-starting the event,
	 * clear the ->hw.state (STOPPED and UPTODATE flags), so the user
	 * notification is re-enabled.
	 */
	if (!(ef_flags & PERF_EF_START))
		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
	else
		event->hw.state = 0;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time(->commit_txn) as a whole
	 */
	if (cpuhw->group_flag & PERF_EVENT_TXN)
		goto nocheck;

	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
		goto out;
	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
		goto out;
	event->hw.config = cpuhw->events[n0];

nocheck:
	++cpuhw->n_events;
	++cpuhw->n_added;

	ret = 0;
 out:
	if (has_branch_stack(event))
		power_pmu_bhrb_enable(event);

	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
	return ret;
}

/*
 * Remove an event from the PMU.
 */
static void power_pmu_del(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	long i;
	unsigned long flags;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);

	cpuhw = &__get_cpu_var(cpu_hw_events);
	for (i = 0; i < cpuhw->n_events; ++i) {
		if (event == cpuhw->event[i]) {
			while (++i < cpuhw->n_events) {
				cpuhw->event[i-1] = cpuhw->event[i];
				cpuhw->events[i-1] = cpuhw->events[i];
				cpuhw->flags[i-1] = cpuhw->flags[i];
			}
			--cpuhw->n_events;
			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
			if (event->hw.idx) {
				write_pmc(event->hw.idx, 0);
				event->hw.idx = 0;
			}
			perf_event_update_userpage(event);
			break;
		}
	}
	for (i = 0; i < cpuhw->n_limited; ++i)
		if (event == cpuhw->limited_counter[i])
			break;
	if (i < cpuhw->n_limited) {
		while (++i < cpuhw->n_limited) {
			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
		}
		--cpuhw->n_limited;
	}
	if (cpuhw->n_events == 0) {
		/* disable exceptions if no events are running */
		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
	}

	if (has_branch_stack(event))
		power_pmu_bhrb_disable(event);

	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * POWER-PMU does not support disabling individual counters, hence
 * program their cycle counter to their max value and ignore the interrupts.
 */

static void power_pmu_start(struct perf_event *event, int ef_flags)
{
	unsigned long flags;
	s64 left;
	unsigned long val;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (!(event->hw.state & PERF_HES_STOPPED))
		return;

	if (ef_flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	event->hw.state = 0;
	left = local64_read(&event->hw.period_left);

	val = 0;
	if (left < 0x80000000L)
		val = 0x80000000L - left;

	write_pmc(event->hw.idx, val);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

static void power_pmu_stop(struct perf_event *event, int ef_flags)
{
	unsigned long flags;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);
	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	write_pmc(event->hw.idx, 0);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
void power_pmu_start_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	perf_pmu_disable(pmu);
	cpuhw->group_flag |= PERF_EVENT_TXN;
	cpuhw->n_txn_start = cpuhw->n_events;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
void power_pmu_cancel_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
int power_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	long i, n;

	if (!ppmu)
		return -EAGAIN;
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n = cpuhw->n_events;
	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
		return -EAGAIN;
	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
	if (i < 0)
		return -EAGAIN;

	for (i = cpuhw->n_txn_start; i < n; ++i)
		cpuhw->event[i]->hw.config = cpuhw->events[i];

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}

/*
 * Return 1 if we might be able to put the event on a limited PMC,
 * or 0 if not.
 * An event can only go on a limited PMC if it counts something
 * that a limited PMC can count, doesn't require interrupts, and
 * doesn't exclude any processor mode.
 */
static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
				 unsigned int flags)
{
	int n;
	u64 alt[MAX_EVENT_ALTERNATIVES];

	if (event->attr.exclude_user
	    || event->attr.exclude_kernel
	    || event->attr.exclude_hv
	    || event->attr.sample_period)
		return 0;

	if (ppmu->limited_pmc_event(ev))
		return 1;

	/*
	 * The requested event_id isn't on a limited PMC already;
	 * see if any alternative code goes on a limited PMC.
	 */
	if (!ppmu->get_alternatives)
		return 0;

	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
	n = ppmu->get_alternatives(ev, flags, alt);

	return n > 0;
}

/*
 * Find an alternative event_id that goes on a normal PMC, if possible,
 * and return the event_id code, or 0 if there is no such alternative.
 * (Note: event_id code 0 is "don't count" on all machines.)
 */
static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
{
	u64 alt[MAX_EVENT_ALTERNATIVES];
	int n;

	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
	n = ppmu->get_alternatives(ev, flags, alt);
	if (!n)
		return 0;
	return alt[0];
}

/* Number of perf_events counting hardware events */
static atomic_t num_events;
/* Used to avoid races in calling reserve/release_pmc_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/*
 * Release the PMU if this is the last perf_event.
 */
static void hw_perf_event_destroy(struct perf_event *event)
{
	if (!atomic_add_unless(&num_events, -1, 1)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_events) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Translate a generic cache event_id config to a raw event_id code.
 */
static int hw_perf_cache_event(u64 config, u64 *eventp)
{
	unsigned long type, op, result;
	int ev;

	if (!ppmu->cache_events)
		return -EINVAL;

	/* unpack config */
	type = config & 0xff;
	op = (config >> 8) & 0xff;
	result = (config >> 16) & 0xff;

	if (type >= PERF_COUNT_HW_CACHE_MAX ||
	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	ev = (*ppmu->cache_events)[type][op][result];
	if (ev == 0)
		return -EOPNOTSUPP;
	if (ev == -1)
		return -EINVAL;
	*eventp = ev;
	return 0;
}
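/*
 * Example encoding (illustrative): config packs type in bits 0-7, op in
 * bits 8-15 and result in bits 16-23, so an L1D read miss is
 * (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) |
 * (PERF_COUNT_HW_CACHE_OP_READ << 8) | PERF_COUNT_HW_CACHE_L1D == 0x10000,
 * which indexes (*ppmu->cache_events)[0][0][1] above.
 */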

static int power_pmu_event_init(struct perf_event *event)
{
	u64 ev;
	unsigned long flags;
	struct perf_event *ctrs[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int cflags[MAX_HWEVENTS];
	int n;
	int err;
	struct cpu_hw_events *cpuhw;

	if (!ppmu)
		return -ENOENT;

	if (has_branch_stack(event)) {
	        /* PMU has BHRB enabled */
		if (!(ppmu->flags & PPMU_BHRB))
			return -EOPNOTSUPP;
	}

	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		ev = event->attr.config;
		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
			return -EOPNOTSUPP;
		ev = ppmu->generic_events[ev];
		break;
	case PERF_TYPE_HW_CACHE:
		err = hw_perf_cache_event(event->attr.config, &ev);
		if (err)
			return err;
		break;
	case PERF_TYPE_RAW:
		ev = event->attr.config;
		break;
	default:
		return -ENOENT;
	}

	event->hw.config_base = ev;
	event->hw.idx = 0;

	/*
	 * If we are not running on a hypervisor, force the
	 * exclude_hv bit to 0 so that we don't care what
	 * the user set it to.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		event->attr.exclude_hv = 0;

	/*
	 * If this is a per-task event, then we can use
	 * PM_RUN_* events interchangeably with their non RUN_*
	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
	 * XXX we should check if the task is an idle task.
	 */
	flags = 0;
	if (event->attach_state & PERF_ATTACH_TASK)
		flags |= PPMU_ONLY_COUNT_RUN;

	/*
	 * If this machine has limited events, check whether this
	 * event_id could go on a limited event.
	 */
	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
		if (can_go_on_limited_pmc(event, ev, flags)) {
			flags |= PPMU_LIMITED_PMC_OK;
		} else if (ppmu->limited_pmc_event(ev)) {
			/*
			 * The requested event_id is on a limited PMC,
			 * but we can't use a limited PMC; see if any
			 * alternative goes on a normal PMC.
			 */
			ev = normal_pmc_alternative(ev, flags);
			if (!ev)
				return -EINVAL;
		}
	}

	/*
	 * If this is in a group, check if it can go on with all the
	 * other hardware events in the group.  We assume the event
	 * hasn't been linked into its leader's sibling list at this point.
	 */
	n = 0;
	if (event->group_leader != event) {
		n = collect_events(event->group_leader, ppmu->n_counter - 1,
				   ctrs, events, cflags);
		if (n < 0)
			return -EINVAL;
	}
	events[n] = ev;
	ctrs[n] = event;
	cflags[n] = flags;
	if (check_excludes(ctrs, cflags, n, 1))
		return -EINVAL;

	cpuhw = &get_cpu_var(cpu_hw_events);
	err = power_check_constraints(cpuhw, events, cflags, n + 1);

	if (has_branch_stack(event)) {
		cpuhw->bhrb_filter = ppmu->bhrb_filter_map(
					event->attr.branch_sample_type);

		if (cpuhw->bhrb_filter == -1)
			return -EOPNOTSUPP;
	}

	put_cpu_var(cpu_hw_events);
	if (err)
		return -EINVAL;

	event->hw.config = events[n];
	event->hw.event_base = cflags[n];
	event->hw.last_period = event->hw.sample_period;
	local64_set(&event->hw.period_left, event->hw.last_period);

	/*
	 * See if we need to reserve the PMU.
	 * If no events are currently in use, then we have to take a
	 * mutex to ensure that we don't race with another task doing
	 * reserve_pmc_hardware or release_pmc_hardware.
	 */
	err = 0;
	if (!atomic_inc_not_zero(&num_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_events) == 0 &&
		    reserve_pmc_hardware(perf_event_interrupt))
			err = -EBUSY;
		else
			atomic_inc(&num_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	event->destroy = hw_perf_event_destroy;

	return err;
}

static int power_pmu_event_idx(struct perf_event *event)
{
	return event->hw.idx;
}

ssize_t power_events_sysfs_show(struct device *dev,
				struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);

	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
}

struct pmu power_pmu = {
	.pmu_enable	= power_pmu_enable,
	.pmu_disable	= power_pmu_disable,
	.event_init	= power_pmu_event_init,
	.add		= power_pmu_add,
	.del		= power_pmu_del,
	.start		= power_pmu_start,
	.stop		= power_pmu_stop,
	.read		= power_pmu_read,
	.start_txn	= power_pmu_start_txn,
	.cancel_txn	= power_pmu_cancel_txn,
	.commit_txn	= power_pmu_commit_txn,
	.event_idx	= power_pmu_event_idx,
	.flush_branch_stack = power_pmu_flush_branch_stack,
};

/*
 * A counter has overflowed; update its count and record
 * things if requested.  Note that interrupts are hard-disabled
 * here so there is no possibility of being interrupted.
 */
static void record_and_restart(struct perf_event *event, unsigned long val,
			       struct pt_regs *regs)
{
	u64 period = event->hw.sample_period;
	s64 prev, delta, left;
	int record = 0;

	if (event->hw.state & PERF_HES_STOPPED) {
		write_pmc(event->hw.idx, 0);
		return;
	}

	/* we don't have to worry about interrupts here */
	prev = local64_read(&event->hw.prev_count);
	delta = check_and_compute_delta(prev, val);
	local64_add(delta, &event->count);

	/*
	 * See if the total period for this event has expired,
	 * and update for the next period.
	 */
	val = 0;
	left = local64_read(&event->hw.period_left) - delta;
	if (delta == 0)
		left++;
	if (period) {
		if (left <= 0) {
			left += period;
			if (left <= 0)
				left = period;
			record = siar_valid(regs);
			event->hw.last_period = event->hw.sample_period;
		}
		if (left < 0x80000000LL)
			val = 0x80000000LL - left;
	}

	write_pmc(event->hw.idx, val);
	local64_set(&event->hw.prev_count, val);
	local64_set(&event->hw.period_left, left);
	perf_event_update_userpage(event);

	/*
	 * Finally record data if requested.
	 */
	if (record) {
		struct perf_sample_data data;

		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);

		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
			perf_get_data_addr(regs, &data.addr);

		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
			struct cpu_hw_events *cpuhw;
			cpuhw = &__get_cpu_var(cpu_hw_events);
			power_pmu_bhrb_read(cpuhw);
			data.br_stack = &cpuhw->bhrb_stack;
		}

		if (perf_event_overflow(event, &data, regs))
			power_pmu_stop(event, 0);
	}
}
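/*
 * Illustrative numbers: with sample_period = 100000, period_left = 700 and
 * a delta of 1000 since the last read, left becomes -300, is topped up to
 * 99700, and the PMC is re-armed to 0x80000000 - 99700, so the next
 * interrupt fires after roughly 99700 more events.
 */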

/*
 * Called from generic code to get the misc flags (i.e. processor mode)
 * for an event_id.
 */
unsigned long perf_misc_flags(struct pt_regs *regs)
{
	u32 flags = perf_get_misc_flags(regs);

	if (flags)
		return flags;
	return user_mode(regs) ? PERF_RECORD_MISC_USER :
		PERF_RECORD_MISC_KERNEL;
}

/*
 * Called from generic code to get the instruction pointer
 * for an event_id.
 */
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (use_siar && siar_valid(regs))
		return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
	else if (use_siar)
		return 0;		// no valid instruction pointer
	else
		return regs->nip;
}

static bool pmc_overflow_power7(unsigned long val)
{
	/*
	 * Events on POWER7 can roll back if a speculative event doesn't
	 * eventually complete. Unfortunately in some rare cases they will
	 * raise a performance monitor exception. We need to catch this to
	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
	 * cycles from overflow.
	 *
	 * We only do this if the first pass fails to find any overflowing
	 * PMCs because a user might set a period of less than 256 and we
	 * don't want to mistakenly reset them.
	 */
	if ((0x80000000 - val) <= 256)
		return true;

	return false;
}

static bool pmc_overflow(unsigned long val)
{
	if ((int)val < 0)
		return true;

	return false;
}

/*
 * Performance monitor interrupt stuff
 */
static void perf_event_interrupt(struct pt_regs *regs)
{
	int i, j;
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	unsigned long val[8];
	int found, active;
	int nmi;

	if (cpuhw->n_limited)
		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
					mfspr(SPRN_PMC6));

	perf_read_regs(regs);

	nmi = perf_intr_is_nmi(regs);
	if (nmi)
		nmi_enter();
	else
		irq_enter();

	/* Read all the PMCs since we'll need them a bunch of times */
	for (i = 0; i < ppmu->n_counter; ++i)
		val[i] = read_pmc(i + 1);

	/* Try to find what caused the IRQ */
	found = 0;
	for (i = 0; i < ppmu->n_counter; ++i) {
		if (!pmc_overflow(val[i]))
			continue;
		if (is_limited_pmc(i + 1))
			continue; /* these won't generate IRQs */
		/*
		 * We've found one that's overflowed.  For active
		 * counters we need to log this.  For inactive
		 * counters, we need to reset it anyway
		 */
		found = 1;
		active = 0;
		for (j = 0; j < cpuhw->n_events; ++j) {
			event = cpuhw->event[j];
			if (event->hw.idx == (i + 1)) {
				active = 1;
				record_and_restart(event, val[i], regs);
				break;
			}
		}
		if (!active)
			/* reset non active counters that have overflowed */
			write_pmc(i + 1, 0);
	}
	if (!found && pvr_version_is(PVR_POWER7)) {
		/* check active counters for special buggy p7 overflow */
		for (i = 0; i < cpuhw->n_events; ++i) {
			event = cpuhw->event[i];
			if (!event->hw.idx || is_limited_pmc(event->hw.idx))
				continue;
			if (pmc_overflow_power7(val[event->hw.idx - 1])) {
				/* event has overflowed in a buggy way */
				found = 1;
				record_and_restart(event,
						   val[event->hw.idx - 1],
						   regs);
			}
		}
	}
	if ((!found) && printk_ratelimit())
		printk(KERN_WARNING "Can't find PMC that caused IRQ\n");

	/*
	 * Reset MMCR0 to its normal value.  This will set PMXE and
	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
	 * and thus allow interrupts to occur again.
	 * XXX might want to use MSR.PM to keep the events frozen until
	 * we get back out of this interrupt.
	 */
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	if (nmi)
		nmi_exit();
	else
		irq_exit();
}

static void power_pmu_setup(int cpu)
{
	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

	if (!ppmu)
		return;
	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
}

static int __cpuinit
power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		power_pmu_setup(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

int __cpuinit register_power_pmu(struct power_pmu *pmu)
{
	if (ppmu)
		return -EBUSY;		/* something's already registered */

	ppmu = pmu;
	pr_info("%s performance monitor hardware support registered\n",
		pmu->name);

	power_pmu.attr_groups = ppmu->attr_groups;

#ifdef MSR_HV
	/*
	 * Use FCHV to ignore kernel events if MSR.HV is set.
	 */
	if (mfmsr() & MSR_HV)
		freeze_events_kernel = MMCR0_FCHV;
#endif /* CONFIG_PPC64 */

	perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(power_pmu_notifier);

	return 0;
}
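/*
 * Typical caller (sketch, not part of this file): a CPU-specific driver
 * such as power7-pmu.c fills in a struct power_pmu and registers it from
 * an early initcall, roughly:
 *
 *	static int __init init_power7_pmu(void)
 *	{
 *		return register_power_pmu(&power7_pmu);
 *	}
 *	early_initcall(init_power7_pmu);
 */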