/*
 * Performance event support - powerpc architecture code
 *
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>
#include <asm/code-patching.h>

#define BHRB_MAX_ENTRIES	32
#define BHRB_TARGET		0x0000000000000002
#define BHRB_PREDICTION		0x0000000000000001
#define BHRB_EA			0xFFFFFFFFFFFFFFFC
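/*
 * Each 64-bit value returned by read_bhrb() packs the branch effective
 * address in the upper bits (BHRB_EA) together with the prediction and
 * target flags in the low two bits; power_pmu_bhrb_read() below decodes
 * entries using these masks.
 */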

struct cpu_hw_events {
	int n_events;
	int n_percpu;
	int disabled;
	int n_added;
	int n_limited;
	u8  pmcs_enabled;
	struct perf_event *event[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int flags[MAX_HWEVENTS];
	unsigned long mmcr[3];
	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];

	unsigned int group_flag;
	int n_txn_start;

	/* BHRB bits */
	u64				bhrb_filter;	/* BHRB HW branch filter */
	int				bhrb_users;
	void				*bhrb_context;
	struct	perf_branch_stack	bhrb_stack;
	struct	perf_branch_entry	bhrb_entries[BHRB_MAX_ENTRIES];
};

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

struct power_pmu *ppmu;

/*
 * Normally, to ignore kernel events we set the FCS (freeze counters
 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 * hypervisor bit set in the MSR, or if we are running on a processor
 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 * then we need to use the FCHV bit to ignore kernel events.
 */
static unsigned int freeze_events_kernel = MMCR0_FCS;

/*
 * 32-bit doesn't have MMCRA but does have an MMCR2,
 * and a few other names are different.
 */
#ifdef CONFIG_PPC32

#define MMCR0_FCHV		0
#define MMCR0_PMCjCE		MMCR0_PMCnCE
#define MMCR0_FC56		0
#define MMCR0_PMAO		0

#define SPRN_MMCRA		SPRN_MMCR2
#define MMCRA_SAMPLE_ENABLE	0

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_read_regs(struct pt_regs *regs)
{
	regs->result = 0;
}
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return 0;
}

static inline int siar_valid(struct pt_regs *regs)
{
	return 1;
}

static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
void power_pmu_flush_branch_stack(void) {}
static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
#endif /* CONFIG_PPC32 */

static bool regs_use_siar(struct pt_regs *regs)
{
	return !!regs->result;
}

/*
 * Things that are specific to 64-bit implementations.
 */
#ifdef CONFIG_PPC64

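/*
 * Adjust the sampled instruction address.  For marked instructions,
 * MMCRA[SLOT] (on PMUs advertising PPMU_HAS_SSLOT) records the position
 * of the sampled instruction within its group; the adjustment below
 * assumes SIAR points at the first slot and adds 4 bytes per slot
 * beyond the first.
 */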
static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;

	if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) {
		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
		if (slot > 1)
			return 4 * (slot - 1);
	}

	return 0;
}

/*
 * The user wants a data address recorded.
 * If we're not doing instruction sampling, give them the SDAR
 * (sampled data address).  If we are doing instruction sampling, then
 * only give them the SDAR if it corresponds to the instruction
 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the
 * [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER.
 */
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
{
	unsigned long mmcra = regs->dsisr;
	bool sdar_valid;

	if (ppmu->flags & PPMU_HAS_SIER)
		sdar_valid = regs->dar & SIER_SDAR_VALID;
	else {
		unsigned long sdsync;

		if (ppmu->flags & PPMU_SIAR_VALID)
			sdsync = POWER7P_MMCRA_SDAR_VALID;
		else if (ppmu->flags & PPMU_ALT_SIPR)
			sdsync = POWER6_MMCRA_SDSYNC;
		else
			sdsync = MMCRA_SDSYNC;

		sdar_valid = mmcra & sdsync;
	}

	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
		*addrp = mfspr(SPRN_SDAR);
}

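/*
 * regs_sihv()/regs_sipr() return the HV and PR bits captured at sample
 * time: from the SIER image in regs->dar when the PMU has a SIER, and
 * from the MMCRA image in regs->dsisr otherwise.
 */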
static bool regs_sihv(struct pt_regs *regs)
{
	unsigned long sihv = MMCRA_SIHV;

	if (ppmu->flags & PPMU_HAS_SIER)
		return !!(regs->dar & SIER_SIHV);

	if (ppmu->flags & PPMU_ALT_SIPR)
		sihv = POWER6_MMCRA_SIHV;

	return !!(regs->dsisr & sihv);
}

static bool regs_sipr(struct pt_regs *regs)
{
	unsigned long sipr = MMCRA_SIPR;

	if (ppmu->flags & PPMU_HAS_SIER)
		return !!(regs->dar & SIER_SIPR);

	if (ppmu->flags & PPMU_ALT_SIPR)
		sipr = POWER6_MMCRA_SIPR;

	return !!(regs->dsisr & sipr);
}

static inline u32 perf_flags_from_msr(struct pt_regs *regs)
{
	if (regs->msr & MSR_PR)
		return PERF_RECORD_MISC_USER;
	if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
		return PERF_RECORD_MISC_HYPERVISOR;
	return PERF_RECORD_MISC_KERNEL;
}

static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (!use_siar)
		return perf_flags_from_msr(regs);

	/*
	 * If we don't have flags in MMCRA, rather than using
	 * the MSR, we intuit the flags from the address in
	 * SIAR which should give slightly more reliable
	 * results
	 */
	if (ppmu->flags & PPMU_NO_SIPR) {
		unsigned long siar = mfspr(SPRN_SIAR);
		if (siar >= PAGE_OFFSET)
			return PERF_RECORD_MISC_KERNEL;
		return PERF_RECORD_MISC_USER;
	}

	/* PR has priority over HV, so order below is important */
	if (regs_sipr(regs))
		return PERF_RECORD_MISC_USER;

	if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV))
		return PERF_RECORD_MISC_HYPERVISOR;

	return PERF_RECORD_MISC_KERNEL;
}

/*
 * Overload regs->dsisr to store MMCRA so we only need to read it once
 * on each interrupt.
 * Overload regs->dar to store SIER if we have it.
 * Overload regs->result to specify whether we should use the MSR (result
 * is zero) or the SIAR (result is non zero).
 */
static inline void perf_read_regs(struct pt_regs *regs)
{
	unsigned long mmcra = mfspr(SPRN_MMCRA);
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
	int use_siar;

	regs->dsisr = mmcra;

	if (ppmu->flags & PPMU_HAS_SIER)
		regs->dar = mfspr(SPRN_SIER);

	/*
	 * If this isn't a PMU exception (eg a software event) the SIAR is
	 * not valid. Use pt_regs.
	 *
	 * If it is a marked event use the SIAR.
	 *
	 * If the PMU doesn't update the SIAR for non marked events use
	 * pt_regs.
	 *
	 * If the PMU has HV/PR flags then check to see if they
	 * place the exception in userspace. If so, use pt_regs. In
	 * continuous sampling mode the SIAR and the PMU exception are
	 * not synchronised, so they may be many instructions apart.
	 * This can result in confusing backtraces. We still want
	 * hypervisor samples as well as samples in the kernel with
	 * interrupts off hence the userspace check.
	 */
	if (TRAP(regs) != 0xf00)
		use_siar = 0;
	else if (marked)
		use_siar = 1;
	else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
		use_siar = 0;
	else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs))
		use_siar = 0;
	else
		use_siar = 1;

	regs->result = use_siar;
}

/*
 * If interrupts were soft-disabled when a PMU interrupt occurs, treat
 * it as an NMI.
 */
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return !regs->softe;
}

/*
 * On processors like P7+ that have the SIAR-Valid bit, marked instructions
 * must be sampled only if the SIAR-valid bit is set.
 *
 * For unmarked instructions and for processors that don't have the SIAR-Valid
 * bit, assume that SIAR is valid.
 */
static inline int siar_valid(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;

	if (marked) {
		if (ppmu->flags & PPMU_HAS_SIER)
			return regs->dar & SIER_SIAR_VALID;

		if (ppmu->flags & PPMU_SIAR_VALID)
			return mmcra & POWER7P_MMCRA_SIAR_VALID;
	}

	return 1;
}

/* Reset all possible BHRB entries */
static void power_pmu_bhrb_reset(void)
{
	asm volatile(PPC_CLRBHRB);
}

static void power_pmu_bhrb_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!ppmu->bhrb_nr)
		return;

	/* Clear BHRB if we changed task context to avoid data leaks */
	if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
		power_pmu_bhrb_reset();
		cpuhw->bhrb_context = event->ctx;
	}
	cpuhw->bhrb_users++;
}

static void power_pmu_bhrb_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!ppmu->bhrb_nr)
		return;

	cpuhw->bhrb_users--;
	WARN_ON_ONCE(cpuhw->bhrb_users < 0);

	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
		/* BHRB cannot be turned off when other
		 * events are active on the PMU.
		 */

		/* avoid stale pointer */
		cpuhw->bhrb_context = NULL;
	}
}

/* Called from ctxsw to prevent one process's branch entries from
 * mingling with another process's entries during context switch.
 */
void power_pmu_flush_branch_stack(void)
{
	if (ppmu->bhrb_nr)
		power_pmu_bhrb_reset();
}
/* Calculate the to address for a branch */
static __u64 power_pmu_bhrb_to(u64 addr)
{
	unsigned int instr;
	int ret;
	__u64 target;

	if (is_kernel_addr(addr))
		return branch_target((unsigned int *)addr);

	/* Userspace: need copy instruction here then translate it */
	pagefault_disable();
	ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
	if (ret) {
		pagefault_enable();
		return 0;
	}
	pagefault_enable();

	target = branch_target(&instr);
	if ((!target) || (instr & BRANCH_ABSOLUTE))
		return target;

	/* Translate relative branch target from kernel to user address */
	return target - (unsigned long)&instr + addr;
}
/* Processing BHRB entries */
void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
{
	u64 val;
	u64 addr;
	int r_index, u_index, pred;

	r_index = 0;
	u_index = 0;
	while (r_index < ppmu->bhrb_nr) {
		/* Assembly read function */
		val = read_bhrb(r_index++);
		if (!val)
			/* Terminal marker: End of valid BHRB entries */
			break;
		else {
			addr = val & BHRB_EA;
			pred = val & BHRB_PREDICTION;

			if (!addr)
				/* invalid entry */
				continue;

			/* Branches are read most recent first (ie. mfbhrb 0 is
			 * the most recent branch).
			 * There are two types of valid entries:
			 * 1) a target entry which is the to address of a
			 *    computed goto like a blr,bctr,btar.  The next
			 *    entry read from the bhrb will be the branch
			 *    corresponding to this target (ie. the actual
			 *    blr/bctr/btar instruction).
			 * 2) a from address which is an actual branch.  If a
			 *    target entry precedes this, then this is the
			 *    matching branch for that target.  If this is not
			 *    following a target entry, then this is a branch
			 *    where the target is given as an immediate field
			 *    in the instruction (ie. an i or b form branch).
			 *    In this case we need to read the instruction from
			 *    memory to determine the target/to address.
			 */
			if (val & BHRB_TARGET) {
				/* Target branches use two entries
				 * (ie. computed gotos/XL form)
				 */
				cpuhw->bhrb_entries[u_index].to = addr;
				cpuhw->bhrb_entries[u_index].mispred = pred;
				cpuhw->bhrb_entries[u_index].predicted = ~pred;

				/* Get from address in next entry */
				val = read_bhrb(r_index++);
				addr = val & BHRB_EA;
				if (val & BHRB_TARGET) {
					/* Shouldn't have two targets in a
					   row.. Reset index and try again */
					r_index--;
					addr = 0;
				}
				cpuhw->bhrb_entries[u_index].from = addr;
			} else {
				/* Branches to immediate field
				   (ie I or B form) */
				cpuhw->bhrb_entries[u_index].from = addr;
				cpuhw->bhrb_entries[u_index].to =
					power_pmu_bhrb_to(addr);
				cpuhw->bhrb_entries[u_index].mispred = pred;
				cpuhw->bhrb_entries[u_index].predicted = ~pred;
			}
			u_index++;

		}
	}
	cpuhw->bhrb_stack.nr = u_index;
	return;
}

#endif /* CONFIG_PPC64 */

static void perf_event_interrupt(struct pt_regs *regs);

void perf_event_print_debug(void)
{
}

/*
 * Read one performance monitor counter (PMC).
 */
static unsigned long read_pmc(int idx)
{
	unsigned long val;

	switch (idx) {
	case 1:
		val = mfspr(SPRN_PMC1);
		break;
	case 2:
		val = mfspr(SPRN_PMC2);
		break;
	case 3:
		val = mfspr(SPRN_PMC3);
		break;
	case 4:
		val = mfspr(SPRN_PMC4);
		break;
	case 5:
		val = mfspr(SPRN_PMC5);
		break;
	case 6:
		val = mfspr(SPRN_PMC6);
		break;
#ifdef CONFIG_PPC64
	case 7:
		val = mfspr(SPRN_PMC7);
		break;
	case 8:
		val = mfspr(SPRN_PMC8);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
		val = 0;
	}
	return val;
}

/*
 * Write one PMC.
 */
static void write_pmc(int idx, unsigned long val)
{
	switch (idx) {
	case 1:
		mtspr(SPRN_PMC1, val);
		break;
	case 2:
		mtspr(SPRN_PMC2, val);
		break;
	case 3:
		mtspr(SPRN_PMC3, val);
		break;
	case 4:
		mtspr(SPRN_PMC4, val);
		break;
	case 5:
		mtspr(SPRN_PMC5, val);
		break;
	case 6:
		mtspr(SPRN_PMC6, val);
		break;
#ifdef CONFIG_PPC64
	case 7:
		mtspr(SPRN_PMC7, val);
		break;
	case 8:
		mtspr(SPRN_PMC8, val);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	}
}

/*
 * Check if a set of events can all go on the PMU at once.
 * If they can't, this will look at alternative codes for the events
 * and see if any combination of alternative codes is feasible.
 * The feasible set is returned in event_id[].
 */
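/*
 * Each event maps (via ppmu->get_constraint) to a (mask, value) pair of
 * packed bit fields; a set of events fits on the PMU when the field-wise
 * sums, formed with ppmu->add_fields and checked with ppmu->test_adder,
 * overflow no field.  When the events don't fit as-is, the backtracking
 * search below tries their alternative encodings.
 */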
static int power_check_constraints(struct cpu_hw_events *cpuhw,
				   u64 event_id[], unsigned int cflags[],
				   int n_ev)
{
	unsigned long mask, value, nv;
	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
	int i, j;
	unsigned long addf = ppmu->add_fields;
	unsigned long tadd = ppmu->test_adder;

	if (n_ev > ppmu->n_counter)
		return -1;

	/* First see if the events will go on as-is */
	for (i = 0; i < n_ev; ++i) {
		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
		    && !ppmu->limited_pmc_event(event_id[i])) {
			ppmu->get_alternatives(event_id[i], cflags[i],
					       cpuhw->alternatives[i]);
			event_id[i] = cpuhw->alternatives[i][0];
		}
		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
					 &cpuhw->avalues[i][0]))
			return -1;
	}
	value = mask = 0;
	for (i = 0; i < n_ev; ++i) {
		nv = (value | cpuhw->avalues[i][0]) +
			(value & cpuhw->avalues[i][0] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
		     cpuhw->amasks[i][0]) != 0)
			break;
		value = nv;
		mask |= cpuhw->amasks[i][0];
	}
	if (i == n_ev)
		return 0;	/* all OK */

	/* doesn't work, gather alternatives... */
	if (!ppmu->get_alternatives)
		return -1;
	for (i = 0; i < n_ev; ++i) {
		choice[i] = 0;
		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
						  cpuhw->alternatives[i]);
		for (j = 1; j < n_alt[i]; ++j)
			ppmu->get_constraint(cpuhw->alternatives[i][j],
					     &cpuhw->amasks[i][j],
					     &cpuhw->avalues[i][j]);
	}

	/* enumerate all possibilities and see if any will work */
	i = 0;
	j = -1;
	value = mask = nv = 0;
	while (i < n_ev) {
		if (j >= 0) {
			/* we're backtracking, restore context */
			value = svalues[i];
			mask = smasks[i];
			j = choice[i];
		}
		/*
		 * See if any alternative k for event_id i,
		 * where k > j, will satisfy the constraints.
		 */
		while (++j < n_alt[i]) {
			nv = (value | cpuhw->avalues[i][j]) +
				(value & cpuhw->avalues[i][j] & addf);
			if ((((nv + tadd) ^ value) & mask) == 0 &&
			    (((nv + tadd) ^ cpuhw->avalues[i][j])
			     & cpuhw->amasks[i][j]) == 0)
				break;
		}
		if (j >= n_alt[i]) {
			/*
			 * No feasible alternative, backtrack
			 * to event_id i-1 and continue enumerating its
			 * alternatives from where we got up to.
			 */
			if (--i < 0)
				return -1;
		} else {
			/*
			 * Found a feasible alternative for event_id i,
			 * remember where we got up to with this event_id,
			 * go on to the next event_id, and start with
			 * the first alternative for it.
			 */
			choice[i] = j;
			svalues[i] = value;
			smasks[i] = mask;
			value = nv;
			mask |= cpuhw->amasks[i][j];
			++i;
			j = -1;
		}
	}

	/* OK, we have a feasible combination, tell the caller the solution */
	for (i = 0; i < n_ev; ++i)
		event_id[i] = cpuhw->alternatives[i][choice[i]];
	return 0;
}

/*
 * Check if newly-added events have consistent settings for
 * exclude_{user,kernel,hv} with each other and any previously
 * added events.
 */
static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
			  int n_prev, int n_new)
{
	int eu = 0, ek = 0, eh = 0;
	int i, n, first;
	struct perf_event *event;

	n = n_prev + n_new;
	if (n <= 1)
		return 0;

	first = 1;
	for (i = 0; i < n; ++i) {
		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
			continue;
		}
		event = ctrs[i];
		if (first) {
			eu = event->attr.exclude_user;
			ek = event->attr.exclude_kernel;
			eh = event->attr.exclude_hv;
			first = 0;
		} else if (event->attr.exclude_user != eu ||
			   event->attr.exclude_kernel != ek ||
			   event->attr.exclude_hv != eh) {
			return -EAGAIN;
		}
	}

	if (eu || ek || eh)
		for (i = 0; i < n; ++i)
			if (cflags[i] & PPMU_LIMITED_PMC_OK)
				cflags[i] |= PPMU_LIMITED_PMC_REQD;

	return 0;
}

static u64 check_and_compute_delta(u64 prev, u64 val)
{
	u64 delta = (val - prev) & 0xfffffffful;

	/*
	 * POWER7 can roll back counter values, if the new value is smaller
	 * than the previous value it will cause the delta and the counter to
	 * have bogus values unless we rolled a counter over.  If a counter is
	 * rolled back, it will be smaller, but within 256, which is the maximum
	 * number of events to roll back at once.  If we detect a rollback
	 * return 0.  This can lead to a small lack of precision in the
	 * counters.
	 */
	if (prev > val && (prev - val) < 256)
		delta = 0;

	return delta;
}

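/*
 * Fold the hardware counter's progress since the last read into
 * event->count.  The cmpxchg loop on prev_count makes the update safe
 * against a PMU interrupt updating the same event concurrently.
 */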
static void power_pmu_read(struct perf_event *event)
{
	s64 val, delta, prev;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	if (!event->hw.idx)
		return;
	/*
	 * Performance monitor interrupts come even when interrupts
	 * are soft-disabled, as long as interrupts are hard-enabled.
	 * Therefore we treat them like NMIs.
	 */
	do {
		prev = local64_read(&event->hw.prev_count);
		barrier();
		val = read_pmc(event->hw.idx);
		delta = check_and_compute_delta(prev, val);
		if (!delta)
			return;
	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);

	local64_add(delta, &event->count);
	local64_sub(delta, &event->hw.period_left);
}

/*
 * On some machines, PMC5 and PMC6 can't be written, don't respect
 * the freeze conditions, and don't generate interrupts.  This tells
 * us if `event' is using such a PMC.
 */
static int is_limited_pmc(int pmcnum)
{
	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
		&& (pmcnum == 5 || pmcnum == 6);
}

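/*
 * Fold the current PMC5/6 values into any limited events and mark them
 * as not counting (hw.idx = 0) while the rest of the PMU is frozen.
 */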
static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
				    unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev, delta;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		if (!event->hw.idx)
			continue;
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		event->hw.idx = 0;
		delta = check_and_compute_delta(prev, val);
		if (delta)
			local64_add(delta, &event->count);
	}
}

static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
				  unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		event->hw.idx = cpuhw->limited_hwidx[i];
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		if (check_and_compute_delta(prev, val))
			local64_set(&event->hw.prev_count, val);
		perf_event_update_userpage(event);
	}
}

/*
 * Since limited events don't respect the freeze conditions, we
 * have to read them immediately after freezing or unfreezing the
 * other events.  We try to keep the values from the limited
 * events as consistent as possible by keeping the delay (in
 * cycles and instructions) between freezing/unfreezing and reading
 * the limited events as small and consistent as possible.
 * Therefore, if any limited events are in use, we read them
 * both, and always in the same order, to minimize variability,
 * and do it inside the same asm that writes MMCR0.
 */
static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
{
	unsigned long pmc5, pmc6;

	if (!cpuhw->n_limited) {
		mtspr(SPRN_MMCR0, mmcr0);
		return;
	}

	/*
	 * Write MMCR0, then read PMC5 and PMC6 immediately.
	 * To ensure we don't get a performance monitor interrupt
	 * between writing MMCR0 and freezing/thawing the limited
	 * events, we first write MMCR0 with the event overflow
	 * interrupt enable bits turned off.
	 */
	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
		     : "=&r" (pmc5), "=&r" (pmc6)
		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
		       "i" (SPRN_MMCR0),
		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));

	if (mmcr0 & MMCR0_FC)
		freeze_limited_counters(cpuhw, pmc5, pmc6);
	else
		thaw_limited_counters(cpuhw, pmc5, pmc6);

	/*
	 * Write the full MMCR0 including the event overflow interrupt
	 * enable bits, if necessary.
	 */
	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
		mtspr(SPRN_MMCR0, mmcr0);
}

/*
 * Disable all events to prevent PMU interrupts and to allow
 * events to be added or removed.
 */
static void power_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags, val;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!cpuhw->disabled) {
		/*
		 * Check if we ever enabled the PMU on this cpu.
		 */
		if (!cpuhw->pmcs_enabled) {
			ppc_enable_pmcs();
			cpuhw->pmcs_enabled = 1;
		}

		/*
		 * Set the 'freeze counters' bit, clear PMAO/FC56.
		 */
		val  = mfspr(SPRN_MMCR0);
		val |= MMCR0_FC;
		val &= ~(MMCR0_PMAO | MMCR0_FC56);

		/*
		 * The barrier is to make sure the mtspr has been
		 * executed and the PMU has frozen the events etc.
		 * before we return.
		 */
		write_mmcr0(cpuhw, val);
		mb();

		/*
		 * Disable instruction sampling if it was enabled
		 */
		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
			mtspr(SPRN_MMCRA,
			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
			mb();
		}

		cpuhw->disabled = 1;
		cpuhw->n_added = 0;
	}
	local_irq_restore(flags);
}

/*
 * Re-enable all events.
 * If we were previously disabled and events were added, then
 * put the new config on the PMU.
 */
static void power_pmu_enable(struct pmu *pmu)
{
	struct perf_event *event;
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	long i;
	unsigned long val;
	s64 left;
	unsigned int hwc_index[MAX_HWEVENTS];
	int n_lim;
	int idx;

	if (!ppmu)
		return;

	local_irq_save(flags);

	cpuhw = &__get_cpu_var(cpu_hw_events);
	if (!cpuhw->disabled)
		goto out;

	cpuhw->disabled = 0;

	/*
	 * If we didn't change anything, or only removed events,
	 * no need to recalculate MMCR* settings and reset the PMCs.
	 * Just reenable the PMU with the current MMCR* settings
	 * (possibly updated for removal of events).
	 */
	if (!cpuhw->n_added) {
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
		if (cpuhw->n_events == 0)
			ppc_set_pmu_inuse(0);
		goto out_enable;
	}

	/*
	 * Compute MMCR* values for the new set of events
	 */
	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
			       cpuhw->mmcr)) {
		/* shouldn't ever get here */
		printk(KERN_ERR "oops compute_mmcr failed\n");
		goto out;
	}

	/*
	 * Add in MMCR0 freeze bits corresponding to the
	 * attr.exclude_* bits for the first event.
	 * We have already checked that all events have the
	 * same values for these bits as the first event.
	 */
	event = cpuhw->event[0];
	if (event->attr.exclude_user)
		cpuhw->mmcr[0] |= MMCR0_FCP;
	if (event->attr.exclude_kernel)
		cpuhw->mmcr[0] |= freeze_events_kernel;
	if (event->attr.exclude_hv)
		cpuhw->mmcr[0] |= MMCR0_FCHV;

	/*
	 * Write the new configuration to MMCR* with the freeze
	 * bit set and set the hardware events to their initial values.
	 * Then unfreeze the events.
	 */
	ppc_set_pmu_inuse(1);
	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
				| MMCR0_FC);

	/*
	 * Read off any pre-existing events that need to move
	 * to another PMC.
	 */
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
			power_pmu_read(event);
			write_pmc(event->hw.idx, 0);
			event->hw.idx = 0;
		}
	}

	/*
	 * Initialize the PMCs for all the new and moved events.
	 */
	cpuhw->n_limited = n_lim = 0;
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx)
			continue;
		idx = hwc_index[i] + 1;
		if (is_limited_pmc(idx)) {
			cpuhw->limited_counter[n_lim] = event;
			cpuhw->limited_hwidx[n_lim] = idx;
			++n_lim;
			continue;
		}
		val = 0;
		if (event->hw.sample_period) {
			left = local64_read(&event->hw.period_left);
			if (left < 0x80000000L)
				val = 0x80000000L - left;
		}
		local64_set(&event->hw.prev_count, val);
		event->hw.idx = idx;
		if (event->hw.state & PERF_HES_STOPPED)
			val = 0;
		write_pmc(idx, val);
		perf_event_update_userpage(event);
	}
	cpuhw->n_limited = n_lim;
	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;

 out_enable:
	mb();
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	/*
	 * Enable instruction sampling if necessary
	 */
	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
		mb();
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	}

 out:
	if (cpuhw->bhrb_users)
		ppmu->config_bhrb(cpuhw->bhrb_filter);

	local_irq_restore(flags);
}

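/*
 * Collect the hardware events in a group (the leader plus any siblings
 * that are not software events and not OFF) into ctrs[], events[] and
 * flags[].  Returns the number collected, or -1 if max_count would be
 * exceeded.
 */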
static int collect_events(struct perf_event *group, int max_count,
			  struct perf_event *ctrs[], u64 *events,
			  unsigned int *flags)
{
	int n = 0;
	struct perf_event *event;

	if (!is_software_event(group)) {
		if (n >= max_count)
			return -1;
		ctrs[n] = group;
		flags[n] = group->hw.event_base;
		events[n++] = group->hw.config;
	}
	list_for_each_entry(event, &group->sibling_list, group_entry) {
		if (!is_software_event(event) &&
		    event->state != PERF_EVENT_STATE_OFF) {
			if (n >= max_count)
				return -1;
			ctrs[n] = event;
			flags[n] = event->hw.event_base;
			events[n++] = event->hw.config;
		}
	}
	return n;
}

/*
 * Add an event to the PMU.
 * If all events are not already frozen, then we disable and
 * re-enable the PMU in order to get hw_perf_enable to do the
 * actual work of reconfiguring the PMU.
 */
static int power_pmu_add(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	int n0;
	int ret = -EAGAIN;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	/*
	 * Add the event to the list (if there is room)
	 * and check whether the total set is still feasible.
	 */
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n0 = cpuhw->n_events;
	if (n0 >= ppmu->n_counter)
		goto out;
	cpuhw->event[n0] = event;
	cpuhw->events[n0] = event->hw.config;
	cpuhw->flags[n0] = event->hw.event_base;

	/*
	 * This event may have been disabled/stopped in record_and_restart()
	 * because we exceeded the ->event_limit. If re-starting the event,
	 * clear the ->hw.state (STOPPED and UPTODATE flags), so the user
	 * notification is re-enabled.
	 */
	if (!(ef_flags & PERF_EF_START))
		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
	else
		event->hw.state = 0;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuhw->group_flag & PERF_EVENT_TXN)
		goto nocheck;

	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
		goto out;
	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
		goto out;
	event->hw.config = cpuhw->events[n0];

nocheck:
	++cpuhw->n_events;
	++cpuhw->n_added;

	ret = 0;
 out:
	if (has_branch_stack(event))
		power_pmu_bhrb_enable(event);

	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
	return ret;
}

/*
 * Remove an event from the PMU.
 */
static void power_pmu_del(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	long i;
	unsigned long flags;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);

	cpuhw = &__get_cpu_var(cpu_hw_events);
	for (i = 0; i < cpuhw->n_events; ++i) {
		if (event == cpuhw->event[i]) {
			while (++i < cpuhw->n_events) {
				cpuhw->event[i-1] = cpuhw->event[i];
				cpuhw->events[i-1] = cpuhw->events[i];
				cpuhw->flags[i-1] = cpuhw->flags[i];
			}
			--cpuhw->n_events;
			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
			if (event->hw.idx) {
				write_pmc(event->hw.idx, 0);
				event->hw.idx = 0;
			}
			perf_event_update_userpage(event);
			break;
		}
	}
	for (i = 0; i < cpuhw->n_limited; ++i)
		if (event == cpuhw->limited_counter[i])
			break;
	if (i < cpuhw->n_limited) {
		while (++i < cpuhw->n_limited) {
			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
		}
		--cpuhw->n_limited;
	}
	if (cpuhw->n_events == 0) {
		/* disable exceptions if no events are running */
		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
	}

	if (has_branch_stack(event))
		power_pmu_bhrb_disable(event);

	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * POWER-PMU does not support disabling individual counters, hence
 * program their cycle counter to their max value and ignore the interrupts.
 */

static void power_pmu_start(struct perf_event *event, int ef_flags)
{
	unsigned long flags;
	s64 left;
	unsigned long val;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (!(event->hw.state & PERF_HES_STOPPED))
		return;

	if (ef_flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	event->hw.state = 0;
	left = local64_read(&event->hw.period_left);

	val = 0;
	if (left < 0x80000000L)
		val = 0x80000000L - left;

	write_pmc(event->hw.idx, val);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

static void power_pmu_stop(struct perf_event *event, int ef_flags)
{
	unsigned long flags;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);
	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	write_pmc(event->hw.idx, 0);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
void power_pmu_start_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	perf_pmu_disable(pmu);
	cpuhw->group_flag |= PERF_EVENT_TXN;
	cpuhw->n_txn_start = cpuhw->n_events;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
void power_pmu_cancel_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
int power_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	long i, n;

	if (!ppmu)
		return -EAGAIN;
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n = cpuhw->n_events;
	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
		return -EAGAIN;
	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
	if (i < 0)
		return -EAGAIN;

	for (i = cpuhw->n_txn_start; i < n; ++i)
		cpuhw->event[i]->hw.config = cpuhw->events[i];

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}

/*
 * Return 1 if we might be able to put the event on a limited PMC,
 * or 0 if not.
 * An event can only go on a limited PMC if it counts something
 * that a limited PMC can count, doesn't require interrupts, and
 * doesn't exclude any processor mode.
 */
static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
				 unsigned int flags)
{
	int n;
	u64 alt[MAX_EVENT_ALTERNATIVES];

	if (event->attr.exclude_user
	    || event->attr.exclude_kernel
	    || event->attr.exclude_hv
	    || event->attr.sample_period)
		return 0;

	if (ppmu->limited_pmc_event(ev))
		return 1;

	/*
	 * The requested event_id isn't on a limited PMC already;
	 * see if any alternative code goes on a limited PMC.
	 */
	if (!ppmu->get_alternatives)
		return 0;

	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
	n = ppmu->get_alternatives(ev, flags, alt);

	return n > 0;
}

/*
 * Find an alternative event_id that goes on a normal PMC, if possible,
 * and return the event_id code, or 0 if there is no such alternative.
 * (Note: event_id code 0 is "don't count" on all machines.)
 */
static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
{
	u64 alt[MAX_EVENT_ALTERNATIVES];
	int n;

	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
	n = ppmu->get_alternatives(ev, flags, alt);
	if (!n)
		return 0;
	return alt[0];
}

/* Number of perf_events counting hardware events */
static atomic_t num_events;
/* Used to avoid races in calling reserve/release_pmc_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/*
 * Release the PMU if this is the last perf_event.
 */
static void hw_perf_event_destroy(struct perf_event *event)
{
	if (!atomic_add_unless(&num_events, -1, 1)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_events) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Translate a generic cache event_id config to a raw event_id code.
 */
static int hw_perf_cache_event(u64 config, u64 *eventp)
{
	unsigned long type, op, result;
	int ev;

	if (!ppmu->cache_events)
		return -EINVAL;

	/* unpack config */
	type = config & 0xff;
	op = (config >> 8) & 0xff;
	result = (config >> 16) & 0xff;

	if (type >= PERF_COUNT_HW_CACHE_MAX ||
	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	ev = (*ppmu->cache_events)[type][op][result];
	if (ev == 0)
		return -EOPNOTSUPP;
	if (ev == -1)
		return -EINVAL;
	*eventp = ev;
	return 0;
}

static int power_pmu_event_init(struct perf_event *event)
{
	u64 ev;
	unsigned long flags;
	struct perf_event *ctrs[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int cflags[MAX_HWEVENTS];
	int n;
	int err;
	struct cpu_hw_events *cpuhw;

	if (!ppmu)
		return -ENOENT;

	if (has_branch_stack(event)) {
		/* PMU has BHRB enabled */
		if (!(ppmu->flags & PPMU_BHRB))
			return -EOPNOTSUPP;
	}

	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		ev = event->attr.config;
		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
			return -EOPNOTSUPP;
		ev = ppmu->generic_events[ev];
		break;
	case PERF_TYPE_HW_CACHE:
		err = hw_perf_cache_event(event->attr.config, &ev);
		if (err)
			return err;
		break;
	case PERF_TYPE_RAW:
		ev = event->attr.config;
		break;
	default:
		return -ENOENT;
	}

	event->hw.config_base = ev;
	event->hw.idx = 0;

	/*
	 * If we are not running on a hypervisor, force the
	 * exclude_hv bit to 0 so that we don't care what
	 * the user set it to.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		event->attr.exclude_hv = 0;

	/*
	 * If this is a per-task event, then we can use
	 * PM_RUN_* events interchangeably with their non RUN_*
	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
	 * XXX we should check if the task is an idle task.
	 */
	flags = 0;
	if (event->attach_state & PERF_ATTACH_TASK)
		flags |= PPMU_ONLY_COUNT_RUN;

	/*
	 * If this machine has limited events, check whether this
	 * event_id could go on a limited event.
	 */
	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
		if (can_go_on_limited_pmc(event, ev, flags)) {
			flags |= PPMU_LIMITED_PMC_OK;
		} else if (ppmu->limited_pmc_event(ev)) {
			/*
			 * The requested event_id is on a limited PMC,
			 * but we can't use a limited PMC; see if any
			 * alternative goes on a normal PMC.
			 */
			ev = normal_pmc_alternative(ev, flags);
			if (!ev)
				return -EINVAL;
		}
	}

	/*
	 * If this is in a group, check if it can go on with all the
	 * other hardware events in the group.  We assume the event
	 * hasn't been linked into its leader's sibling list at this point.
	 */
	n = 0;
	if (event->group_leader != event) {
		n = collect_events(event->group_leader, ppmu->n_counter - 1,
				   ctrs, events, cflags);
		if (n < 0)
			return -EINVAL;
	}
	events[n] = ev;
	ctrs[n] = event;
	cflags[n] = flags;
	if (check_excludes(ctrs, cflags, n, 1))
		return -EINVAL;

	cpuhw = &get_cpu_var(cpu_hw_events);
	err = power_check_constraints(cpuhw, events, cflags, n + 1);

	if (has_branch_stack(event)) {
		cpuhw->bhrb_filter = ppmu->bhrb_filter_map(
					event->attr.branch_sample_type);

		if (cpuhw->bhrb_filter == -1)
			return -EOPNOTSUPP;
	}

	put_cpu_var(cpu_hw_events);
	if (err)
		return -EINVAL;

	event->hw.config = events[n];
	event->hw.event_base = cflags[n];
	event->hw.last_period = event->hw.sample_period;
	local64_set(&event->hw.period_left, event->hw.last_period);

	/*
	 * See if we need to reserve the PMU.
	 * If no events are currently in use, then we have to take a
	 * mutex to ensure that we don't race with another task doing
	 * reserve_pmc_hardware or release_pmc_hardware.
	 */
	err = 0;
	if (!atomic_inc_not_zero(&num_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_events) == 0 &&
		    reserve_pmc_hardware(perf_event_interrupt))
			err = -EBUSY;
		else
			atomic_inc(&num_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	event->destroy = hw_perf_event_destroy;

	return err;
}

static int power_pmu_event_idx(struct perf_event *event)
{
	return event->hw.idx;
}

ssize_t power_events_sysfs_show(struct device *dev,
				struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);

	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
}

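/*
 * The struct pmu that register_power_pmu() hands to perf_pmu_register().
 */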
struct pmu power_pmu = {
	.pmu_enable	= power_pmu_enable,
	.pmu_disable	= power_pmu_disable,
	.event_init	= power_pmu_event_init,
	.add		= power_pmu_add,
	.del		= power_pmu_del,
	.start		= power_pmu_start,
	.stop		= power_pmu_stop,
	.read		= power_pmu_read,
	.start_txn	= power_pmu_start_txn,
	.cancel_txn	= power_pmu_cancel_txn,
	.commit_txn	= power_pmu_commit_txn,
	.event_idx	= power_pmu_event_idx,
	.flush_branch_stack = power_pmu_flush_branch_stack,
};

/*
 * A counter has overflowed; update its count and record
 * things if requested.  Note that interrupts are hard-disabled
 * here so there is no possibility of being interrupted.
 */
static void record_and_restart(struct perf_event *event, unsigned long val,
			       struct pt_regs *regs)
{
	u64 period = event->hw.sample_period;
	s64 prev, delta, left;
	int record = 0;

	if (event->hw.state & PERF_HES_STOPPED) {
		write_pmc(event->hw.idx, 0);
		return;
	}

	/* we don't have to worry about interrupts here */
	prev = local64_read(&event->hw.prev_count);
	delta = check_and_compute_delta(prev, val);
	local64_add(delta, &event->count);

	/*
	 * See if the total period for this event has expired,
	 * and update for the next period.
	 */
	val = 0;
	left = local64_read(&event->hw.period_left) - delta;
	if (delta == 0)
		left++;
	if (period) {
		if (left <= 0) {
			left += period;
			if (left <= 0)
				left = period;
			record = siar_valid(regs);
			event->hw.last_period = event->hw.sample_period;
		}
		if (left < 0x80000000LL)
			val = 0x80000000LL - left;
	}

	write_pmc(event->hw.idx, val);
	local64_set(&event->hw.prev_count, val);
	local64_set(&event->hw.period_left, left);
	perf_event_update_userpage(event);

	/*
	 * Finally record data if requested.
	 */
	if (record) {
		struct perf_sample_data data;

		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);

		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
			perf_get_data_addr(regs, &data.addr);

		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
			struct cpu_hw_events *cpuhw;
			cpuhw = &__get_cpu_var(cpu_hw_events);
			power_pmu_bhrb_read(cpuhw);
			data.br_stack = &cpuhw->bhrb_stack;
		}

		if (perf_event_overflow(event, &data, regs))
			power_pmu_stop(event, 0);
	}
}

/*
 * Called from generic code to get the misc flags (i.e. processor mode)
 * for an event_id.
 */
unsigned long perf_misc_flags(struct pt_regs *regs)
{
	u32 flags = perf_get_misc_flags(regs);

	if (flags)
		return flags;
	return user_mode(regs) ? PERF_RECORD_MISC_USER :
		PERF_RECORD_MISC_KERNEL;
}

/*
 * Called from generic code to get the instruction pointer
 * for an event_id.
 */
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (use_siar && siar_valid(regs))
		return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
	else if (use_siar)
		return 0;		// no valid instruction pointer
	else
		return regs->nip;
}

static bool pmc_overflow_power7(unsigned long val)
{
	/*
	 * Events on POWER7 can roll back if a speculative event doesn't
	 * eventually complete. Unfortunately in some rare cases they will
	 * raise a performance monitor exception. We need to catch this to
	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
	 * cycles from overflow.
	 *
	 * We only do this if the first pass fails to find any overflowing
	 * PMCs because a user might set a period of less than 256 and we
	 * don't want to mistakenly reset them.
	 */
	if ((0x80000000 - val) <= 256)
		return true;

	return false;
}

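/*
 * A PMC has overflowed when bit 31 is set, i.e. when the 32-bit value
 * reads as negative.
 */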
static bool pmc_overflow(unsigned long val)
{
	if ((int)val < 0)
		return true;

	return false;
}

/*
 * Performance monitor interrupt stuff
 */
static void perf_event_interrupt(struct pt_regs *regs)
{
	int i, j;
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	unsigned long val[8];
	int found, active;
	int nmi;

	if (cpuhw->n_limited)
		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
					mfspr(SPRN_PMC6));

	perf_read_regs(regs);

	nmi = perf_intr_is_nmi(regs);
	if (nmi)
		nmi_enter();
	else
		irq_enter();

	/* Read all the PMCs since we'll need them a bunch of times */
	for (i = 0; i < ppmu->n_counter; ++i)
		val[i] = read_pmc(i + 1);

	/* Try to find what caused the IRQ */
	found = 0;
	for (i = 0; i < ppmu->n_counter; ++i) {
		if (!pmc_overflow(val[i]))
			continue;
		if (is_limited_pmc(i + 1))
			continue; /* these won't generate IRQs */
		/*
		 * We've found one that's overflowed.  For active
		 * counters we need to log this.  For inactive
		 * counters, we need to reset them anyway
		 */
		found = 1;
		active = 0;
		for (j = 0; j < cpuhw->n_events; ++j) {
			event = cpuhw->event[j];
			if (event->hw.idx == (i + 1)) {
				active = 1;
				record_and_restart(event, val[i], regs);
				break;
			}
		}
		if (!active)
			/* reset non active counters that have overflowed */
			write_pmc(i + 1, 0);
	}
	if (!found && pvr_version_is(PVR_POWER7)) {
		/* check active counters for special buggy p7 overflow */
		for (i = 0; i < cpuhw->n_events; ++i) {
			event = cpuhw->event[i];
			if (!event->hw.idx || is_limited_pmc(event->hw.idx))
				continue;
			if (pmc_overflow_power7(val[event->hw.idx - 1])) {
				/* event has overflowed in a buggy way */
				found = 1;
				record_and_restart(event,
						   val[event->hw.idx - 1],
						   regs);
			}
		}
	}
	if (!found && !nmi && printk_ratelimit())
		printk(KERN_WARNING "Can't find PMC that caused IRQ\n");

	/*
	 * Reset MMCR0 to its normal value.  This will set PMXE and
	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
	 * and thus allow interrupts to occur again.
	 * XXX might want to use MSR.PM to keep the events frozen until
	 * we get back out of this interrupt.
	 */
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	if (nmi)
		nmi_exit();
	else
		irq_exit();
}

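/*
 * Reset this cpu's cpu_hw_events state, leaving all counters frozen
 * (MMCR0_FC) until events are added.
 */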
static void power_pmu_setup(int cpu)
{
	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

	if (!ppmu)
		return;
	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
}

static int
power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		power_pmu_setup(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

int register_power_pmu(struct power_pmu *pmu)
{
	if (ppmu)
		return -EBUSY;		/* something's already registered */

	ppmu = pmu;
	pr_info("%s performance monitor hardware support registered\n",
		pmu->name);

	power_pmu.attr_groups = ppmu->attr_groups;

#ifdef MSR_HV
	/*
	 * Use FCHV to ignore kernel events if MSR.HV is set.
	 */
	if (mfmsr() & MSR_HV)
		freeze_events_kernel = MMCR0_FCHV;
#endif /* MSR_HV */

	perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(power_pmu_notifier);

	return 0;
}