/*
 * Performance event support - powerpc architecture code
 *
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>

struct cpu_hw_events {
	int n_events;
	int n_percpu;
	int disabled;
	int n_added;
	int n_limited;
	u8  pmcs_enabled;
	struct perf_event *event[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int flags[MAX_HWEVENTS];
	unsigned long mmcr[3];
	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];

	unsigned int group_flag;
	int n_txn_start;
};
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

struct power_pmu *ppmu;

/*
 * Normally, to ignore kernel events we set the FCS (freeze counters
 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 * hypervisor bit set in the MSR, or if we are running on a processor
 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 * then we need to use the FCHV bit to ignore kernel events.
 */
static unsigned int freeze_events_kernel = MMCR0_FCS;

/*
 * 32-bit doesn't have MMCRA but does have an MMCR2,
 * and a few other names are different.
 */
#ifdef CONFIG_PPC32

#define MMCR0_FCHV		0
#define MMCR0_PMCjCE		MMCR0_PMCnCE

#define SPRN_MMCRA		SPRN_MMCR2
#define MMCRA_SAMPLE_ENABLE	0

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_read_regs(struct pt_regs *regs)
{
	regs->result = 0;
}
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return 0;
}

static inline int siar_valid(struct pt_regs *regs)
{
	return 1;
}

#endif /* CONFIG_PPC32 */

static bool regs_use_siar(struct pt_regs *regs)
{
	return !!(regs->result & 1);
}

/*
 * Things that are specific to 64-bit implementations.
 */
#ifdef CONFIG_PPC64

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;

	if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) {
		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
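		/*
		 * For example, slot == 3 yields an adjustment of
		 * 4 * (3 - 1) = 8 bytes, i.e. SIAR is moved forward by
		 * two instructions.
		 */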
		if (slot > 1)
			return 4 * (slot - 1);
	}

	return 0;
}

/*
 * The user wants a data address recorded.
 * If we're not doing instruction sampling, give them the SDAR
 * (sampled data address).  If we are doing instruction sampling, then
 * only give them the SDAR if it corresponds to the instruction
 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC or
 * the [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA.
 */
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
{
	unsigned long mmcra = regs->dsisr;
	unsigned long sdsync;

	if (ppmu->flags & PPMU_SIAR_VALID)
		sdsync = POWER7P_MMCRA_SDAR_VALID;
	else if (ppmu->flags & PPMU_ALT_SIPR)
		sdsync = POWER6_MMCRA_SDSYNC;
	else
		sdsync = MMCRA_SDSYNC;

	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
		*addrp = mfspr(SPRN_SDAR);
}

static bool regs_sihv(struct pt_regs *regs)
{
	unsigned long sihv = MMCRA_SIHV;

	if (ppmu->flags & PPMU_ALT_SIPR)
		sihv = POWER6_MMCRA_SIHV;

	return !!(regs->dsisr & sihv);
}

static bool regs_sipr(struct pt_regs *regs)
{
	unsigned long sipr = MMCRA_SIPR;

	if (ppmu->flags & PPMU_ALT_SIPR)
		sipr = POWER6_MMCRA_SIPR;

	return !!(regs->dsisr & sipr);
}

static bool regs_no_sipr(struct pt_regs *regs)
{
	return !!(regs->result & 2);
}

static inline u32 perf_flags_from_msr(struct pt_regs *regs)
{
	if (regs->msr & MSR_PR)
		return PERF_RECORD_MISC_USER;
	if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
		return PERF_RECORD_MISC_HYPERVISOR;
	return PERF_RECORD_MISC_KERNEL;
}

static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (!use_siar)
		return perf_flags_from_msr(regs);

	/*
	 * If we don't have flags in MMCRA, rather than using
	 * the MSR, we intuit the flags from the address in
	 * SIAR which should give slightly more reliable
	 * results
	 */
	if (regs_no_sipr(regs)) {
		unsigned long siar = mfspr(SPRN_SIAR);
		if (siar >= PAGE_OFFSET)
			return PERF_RECORD_MISC_KERNEL;
		return PERF_RECORD_MISC_USER;
	}

	/* PR has priority over HV, so order below is important */
	if (regs_sipr(regs))
		return PERF_RECORD_MISC_USER;

	if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV))
		return PERF_RECORD_MISC_HYPERVISOR;

	return PERF_RECORD_MISC_KERNEL;
}

/*
 * Overload regs->dsisr to store MMCRA so we only need to read it once
 * on each interrupt.
 * Overload regs->result to cache what we decided here: bit 0 is set
 * if the SIAR should be used rather than the MSR, and bit 1 is set
 * if the PMU has no SIPR/SIHV bits (PPMU_NO_SIPR).
 */
static inline void perf_read_regs(struct pt_regs *regs)
{
	unsigned long mmcra = mfspr(SPRN_MMCRA);
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
	int use_siar;

	regs->dsisr = mmcra;
	regs->result = 0;

	if (ppmu->flags & PPMU_NO_SIPR)
		regs->result |= 2;

	/*
	 * If this isn't a PMU exception (eg a software event) the SIAR is
	 * not valid. Use pt_regs.
	 *
	 * If it is a marked event use the SIAR.
	 *
	 * If the PMU doesn't update the SIAR for non marked events use
	 * pt_regs.
	 *
	 * If the PMU has HV/PR flags then check to see if they
	 * place the exception in userspace. If so, use pt_regs. In
	 * continuous sampling mode the SIAR and the PMU exception are
	 * not synchronised, so they may be many instructions apart.
	 * This can result in confusing backtraces. We still want
	 * hypervisor samples as well as samples in the kernel with
	 * interrupts off hence the userspace check.
	 */
	if (TRAP(regs) != 0xf00)
		use_siar = 0;
	else if (marked)
		use_siar = 1;
	else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
		use_siar = 0;
	else if (!regs_no_sipr(regs) && regs_sipr(regs))
		use_siar = 0;
	else
		use_siar = 1;

	regs->result |= use_siar;
}

/*
 * If interrupts were soft-disabled when a PMU interrupt occurs, treat
 * it as an NMI.
 */
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return !regs->softe;
}

/*
 * On processors like P7+ that have the SIAR-Valid bit, marked instructions
 * must be sampled only if the SIAR-valid bit is set.
 *
 * For unmarked instructions and for processors that don't have the SIAR-Valid
 * bit, assume that SIAR is valid.
 */
static inline int siar_valid(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;
	int marked = mmcra & MMCRA_SAMPLE_ENABLE;

	if ((ppmu->flags & PPMU_SIAR_VALID) && marked)
		return mmcra & POWER7P_MMCRA_SIAR_VALID;

	return 1;
}

#endif /* CONFIG_PPC64 */

static void perf_event_interrupt(struct pt_regs *regs);

void perf_event_print_debug(void)
{
}

/*
 * Read one performance monitor counter (PMC).
 */
static unsigned long read_pmc(int idx)
{
	unsigned long val;

	switch (idx) {
	case 1:
		val = mfspr(SPRN_PMC1);
		break;
	case 2:
		val = mfspr(SPRN_PMC2);
		break;
	case 3:
		val = mfspr(SPRN_PMC3);
		break;
	case 4:
		val = mfspr(SPRN_PMC4);
		break;
	case 5:
		val = mfspr(SPRN_PMC5);
		break;
	case 6:
		val = mfspr(SPRN_PMC6);
		break;
#ifdef CONFIG_PPC64
	case 7:
		val = mfspr(SPRN_PMC7);
		break;
	case 8:
		val = mfspr(SPRN_PMC8);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
		val = 0;
	}
	return val;
}

/*
 * Write one PMC.
 */
static void write_pmc(int idx, unsigned long val)
{
	switch (idx) {
	case 1:
		mtspr(SPRN_PMC1, val);
		break;
	case 2:
		mtspr(SPRN_PMC2, val);
		break;
	case 3:
		mtspr(SPRN_PMC3, val);
		break;
	case 4:
		mtspr(SPRN_PMC4, val);
		break;
	case 5:
		mtspr(SPRN_PMC5, val);
		break;
	case 6:
		mtspr(SPRN_PMC6, val);
		break;
#ifdef CONFIG_PPC64
	case 7:
		mtspr(SPRN_PMC7, val);
		break;
	case 8:
		mtspr(SPRN_PMC8, val);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	}
}

/*
 * Check if a set of events can all go on the PMU at once.
 * If they can't, this will look at alternative codes for the events
 * and see if any combination of alternative codes is feasible.
 * The feasible set is returned in event_id[].
 */
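/*
 * Rough sketch of the constraint arithmetic used below: each event
 * supplies a (value, mask) pair whose bit-fields describe the PMU
 * resources it needs.  New events are folded into a running sum, with
 * ppmu->add_fields supplying the per-field carry bits and
 * ppmu->test_adder helping detect a field that has been over-committed;
 * a non-zero result of the masked comparison means the combination
 * cannot be scheduled as-is and we must try alternative event codes.
 */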
static int power_check_constraints(struct cpu_hw_events *cpuhw,
				   u64 event_id[], unsigned int cflags[],
				   int n_ev)
{
	unsigned long mask, value, nv;
	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
	int i, j;
	unsigned long addf = ppmu->add_fields;
	unsigned long tadd = ppmu->test_adder;

	if (n_ev > ppmu->n_counter)
		return -1;

	/* First see if the events will go on as-is */
	for (i = 0; i < n_ev; ++i) {
		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
		    && !ppmu->limited_pmc_event(event_id[i])) {
			ppmu->get_alternatives(event_id[i], cflags[i],
					       cpuhw->alternatives[i]);
			event_id[i] = cpuhw->alternatives[i][0];
		}
		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
					 &cpuhw->avalues[i][0]))
			return -1;
	}
	value = mask = 0;
	for (i = 0; i < n_ev; ++i) {
		nv = (value | cpuhw->avalues[i][0]) +
			(value & cpuhw->avalues[i][0] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
		     cpuhw->amasks[i][0]) != 0)
			break;
		value = nv;
		mask |= cpuhw->amasks[i][0];
	}
	if (i == n_ev)
		return 0;	/* all OK */

	/* doesn't work, gather alternatives... */
	if (!ppmu->get_alternatives)
		return -1;
	for (i = 0; i < n_ev; ++i) {
		choice[i] = 0;
		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
						  cpuhw->alternatives[i]);
		for (j = 1; j < n_alt[i]; ++j)
			ppmu->get_constraint(cpuhw->alternatives[i][j],
					     &cpuhw->amasks[i][j],
					     &cpuhw->avalues[i][j]);
	}

	/* enumerate all possibilities and see if any will work */
	i = 0;
	j = -1;
	value = mask = nv = 0;
	while (i < n_ev) {
		if (j >= 0) {
			/* we're backtracking, restore context */
			value = svalues[i];
			mask = smasks[i];
			j = choice[i];
		}
		/*
		 * See if any alternative k for event_id i,
		 * where k > j, will satisfy the constraints.
		 */
		while (++j < n_alt[i]) {
			nv = (value | cpuhw->avalues[i][j]) +
				(value & cpuhw->avalues[i][j] & addf);
			if ((((nv + tadd) ^ value) & mask) == 0 &&
			    (((nv + tadd) ^ cpuhw->avalues[i][j])
			     & cpuhw->amasks[i][j]) == 0)
				break;
		}
		if (j >= n_alt[i]) {
			/*
			 * No feasible alternative, backtrack
			 * to event_id i-1 and continue enumerating its
			 * alternatives from where we got up to.
			 */
			if (--i < 0)
				return -1;
		} else {
			/*
			 * Found a feasible alternative for event_id i,
			 * remember where we got up to with this event_id,
			 * go on to the next event_id, and start with
			 * the first alternative for it.
			 */
			choice[i] = j;
			svalues[i] = value;
			smasks[i] = mask;
			value = nv;
			mask |= cpuhw->amasks[i][j];
			++i;
			j = -1;
		}
	}

	/* OK, we have a feasible combination, tell the caller the solution */
	for (i = 0; i < n_ev; ++i)
		event_id[i] = cpuhw->alternatives[i][choice[i]];
	return 0;
}

/*
 * Check if newly-added events have consistent settings for
 * exclude_{user,kernel,hv} with each other and any previously
 * added events.
 */
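/*
 * (The restriction exists because the exclude bits are implemented
 * with global freeze bits in MMCR0 - MMCR0_FCP, freeze_events_kernel
 * and MMCR0_FCHV - which are taken from the first event and therefore
 * apply to every counter at once; see power_pmu_enable().)
 */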
static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
			  int n_prev, int n_new)
{
	int eu = 0, ek = 0, eh = 0;
	int i, n, first;
	struct perf_event *event;

	n = n_prev + n_new;
	if (n <= 1)
		return 0;

	first = 1;
	for (i = 0; i < n; ++i) {
		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
			continue;
		}
		event = ctrs[i];
		if (first) {
			eu = event->attr.exclude_user;
			ek = event->attr.exclude_kernel;
			eh = event->attr.exclude_hv;
			first = 0;
		} else if (event->attr.exclude_user != eu ||
			   event->attr.exclude_kernel != ek ||
			   event->attr.exclude_hv != eh) {
			return -EAGAIN;
		}
	}

	if (eu || ek || eh)
		for (i = 0; i < n; ++i)
			if (cflags[i] & PPMU_LIMITED_PMC_OK)
				cflags[i] |= PPMU_LIMITED_PMC_REQD;

	return 0;
}

static u64 check_and_compute_delta(u64 prev, u64 val)
{
	u64 delta = (val - prev) & 0xfffffffful;
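	/*
	 * Example: prev = 0xfffffff0, val = 0x10 gives
	 * delta = (0x10 - 0xfffffff0) & 0xffffffff = 0x20,
	 * i.e. the 32-bit counter wrapped and advanced by 32 events.
	 */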

	/*
	 * POWER7 can roll back counter values, if the new value is smaller
	 * than the previous value it will cause the delta and the counter to
	 * have bogus values unless we rolled a counter over.  If a counter is
	 * rolled back, it will be smaller, but within 256, which is the maximum
	 * number of events to roll back at once.  If we detect a rollback
	 * return 0.  This can lead to a small lack of precision in the
	 * counters.
	 */
	if (prev > val && (prev - val) < 256)
		delta = 0;

	return delta;
}

static void power_pmu_read(struct perf_event *event)
{
	s64 val, delta, prev;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	if (!event->hw.idx)
		return;
	/*
	 * Performance monitor interrupts come even when interrupts
	 * are soft-disabled, as long as interrupts are hard-enabled.
	 * Therefore we treat them like NMIs.
	 */
	do {
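		/*
		 * If a PMU interrupt updates prev_count between the read
		 * and the cmpxchg below, the cmpxchg fails and we retry
		 * with the freshly updated value.
		 */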
		prev = local64_read(&event->hw.prev_count);
		barrier();
		val = read_pmc(event->hw.idx);
		delta = check_and_compute_delta(prev, val);
		if (!delta)
			return;
	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);

	local64_add(delta, &event->count);
	local64_sub(delta, &event->hw.period_left);
}

/*
 * On some machines, PMC5 and PMC6 can't be written, don't respect
 * the freeze conditions, and don't generate interrupts.  This tells
 * us if `event' is using such a PMC.
 */
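/*
 * (On the processors that set PPMU_LIMITED_PMC5_6, PMC5 and PMC6 are
 * the fixed-function counters - typically instructions completed and
 * cycles - which is why they get this special handling.)
 */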
static int is_limited_pmc(int pmcnum)
{
	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
		&& (pmcnum == 5 || pmcnum == 6);
}

static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
				    unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev, delta;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		if (!event->hw.idx)
			continue;
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		event->hw.idx = 0;
		delta = check_and_compute_delta(prev, val);
		if (delta)
			local64_add(delta, &event->count);
	}
}

static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
				  unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		event->hw.idx = cpuhw->limited_hwidx[i];
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		if (check_and_compute_delta(prev, val))
			local64_set(&event->hw.prev_count, val);
		perf_event_update_userpage(event);
	}
}

/*
 * Since limited events don't respect the freeze conditions, we
 * have to read them immediately after freezing or unfreezing the
 * other events.  We try to keep the values from the limited
 * events as consistent as possible by keeping the delay (in
 * cycles and instructions) between freezing/unfreezing and reading
 * the limited events as small and consistent as possible.
 * Therefore, if any limited events are in use, we read them
 * both, and always in the same order, to minimize variability,
 * and do it inside the same asm that writes MMCR0.
 */
static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
{
	unsigned long pmc5, pmc6;

	if (!cpuhw->n_limited) {
		mtspr(SPRN_MMCR0, mmcr0);
		return;
	}

	/*
	 * Write MMCR0, then read PMC5 and PMC6 immediately.
	 * To ensure we don't get a performance monitor interrupt
	 * between writing MMCR0 and freezing/thawing the limited
	 * events, we first write MMCR0 with the event overflow
	 * interrupt enable bits turned off.
	 */
	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
		     : "=&r" (pmc5), "=&r" (pmc6)
		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
		       "i" (SPRN_MMCR0),
		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));

	if (mmcr0 & MMCR0_FC)
		freeze_limited_counters(cpuhw, pmc5, pmc6);
	else
		thaw_limited_counters(cpuhw, pmc5, pmc6);

	/*
	 * Write the full MMCR0 including the event overflow interrupt
	 * enable bits, if necessary.
	 */
	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
		mtspr(SPRN_MMCR0, mmcr0);
}

/*
 * Disable all events to prevent PMU interrupts and to allow
 * events to be added or removed.
 */
static void power_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!cpuhw->disabled) {
		cpuhw->disabled = 1;
		cpuhw->n_added = 0;

		/*
		 * Check if we ever enabled the PMU on this cpu.
		 */
		if (!cpuhw->pmcs_enabled) {
			ppc_enable_pmcs();
			cpuhw->pmcs_enabled = 1;
		}

		/*
		 * Disable instruction sampling if it was enabled
		 */
		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
			mtspr(SPRN_MMCRA,
			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
			mb();
		}

		/*
		 * Set the 'freeze counters' bit.
		 * The barrier is to make sure the mtspr has been
		 * executed and the PMU has frozen the events
		 * before we return.
		 */
		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
		mb();
	}
	local_irq_restore(flags);
}

/*
 * Re-enable all events.
 * If we were previously disabled and events were added, then
 * put the new config on the PMU.
 */
static void power_pmu_enable(struct pmu *pmu)
{
	struct perf_event *event;
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	long i;
	unsigned long val;
	s64 left;
	unsigned int hwc_index[MAX_HWEVENTS];
	int n_lim;
	int idx;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);
	if (!cpuhw->disabled) {
		local_irq_restore(flags);
		return;
	}
	cpuhw->disabled = 0;

	/*
	 * If we didn't change anything, or only removed events,
	 * no need to recalculate MMCR* settings and reset the PMCs.
	 * Just reenable the PMU with the current MMCR* settings
	 * (possibly updated for removal of events).
	 */
	if (!cpuhw->n_added) {
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
		if (cpuhw->n_events == 0)
			ppc_set_pmu_inuse(0);
		goto out_enable;
	}

	/*
	 * Compute MMCR* values for the new set of events
	 */
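	/*
	 * (compute_mmcr also chooses which hardware PMC each event will
	 * use; the chosen assignment comes back in hwc_index[].)
	 */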
	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
			       cpuhw->mmcr)) {
		/* shouldn't ever get here */
		printk(KERN_ERR "oops compute_mmcr failed\n");
		goto out;
	}

	/*
	 * Add in MMCR0 freeze bits corresponding to the
	 * attr.exclude_* bits for the first event.
	 * We have already checked that all events have the
	 * same values for these bits as the first event.
	 */
	event = cpuhw->event[0];
	if (event->attr.exclude_user)
		cpuhw->mmcr[0] |= MMCR0_FCP;
	if (event->attr.exclude_kernel)
		cpuhw->mmcr[0] |= freeze_events_kernel;
	if (event->attr.exclude_hv)
		cpuhw->mmcr[0] |= MMCR0_FCHV;

	/*
	 * Write the new configuration to MMCR* with the freeze
	 * bit set and set the hardware events to their initial values.
	 * Then unfreeze the events.
	 */
	ppc_set_pmu_inuse(1);
	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
				| MMCR0_FC);

	/*
	 * Read off any pre-existing events that need to move
	 * to another PMC.
	 */
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
			power_pmu_read(event);
			write_pmc(event->hw.idx, 0);
			event->hw.idx = 0;
		}
	}

	/*
	 * Initialize the PMCs for all the new and moved events.
	 */
	cpuhw->n_limited = n_lim = 0;
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx)
			continue;
		idx = hwc_index[i] + 1;
		if (is_limited_pmc(idx)) {
			cpuhw->limited_counter[n_lim] = event;
			cpuhw->limited_hwidx[n_lim] = idx;
			++n_lim;
			continue;
		}
		val = 0;
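		/*
		 * A PMC raises its exception when it becomes negative
		 * (bit 0x80000000 set), so preloading it with
		 * 0x80000000 - left makes it overflow after 'left' more
		 * events have been counted.
		 */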
		if (event->hw.sample_period) {
			left = local64_read(&event->hw.period_left);
			if (left < 0x80000000L)
				val = 0x80000000L - left;
		}
		local64_set(&event->hw.prev_count, val);
		event->hw.idx = idx;
		if (event->hw.state & PERF_HES_STOPPED)
			val = 0;
		write_pmc(idx, val);
		perf_event_update_userpage(event);
	}
	cpuhw->n_limited = n_lim;
	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;

 out_enable:
	mb();
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	/*
	 * Enable instruction sampling if necessary
	 */
	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
		mb();
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	}

 out:
	local_irq_restore(flags);
}

static int collect_events(struct perf_event *group, int max_count,
			  struct perf_event *ctrs[], u64 *events,
			  unsigned int *flags)
{
	int n = 0;
	struct perf_event *event;

	if (!is_software_event(group)) {
		if (n >= max_count)
			return -1;
		ctrs[n] = group;
		flags[n] = group->hw.event_base;
		events[n++] = group->hw.config;
	}
	list_for_each_entry(event, &group->sibling_list, group_entry) {
		if (!is_software_event(event) &&
		    event->state != PERF_EVENT_STATE_OFF) {
			if (n >= max_count)
				return -1;
			ctrs[n] = event;
			flags[n] = event->hw.event_base;
			events[n++] = event->hw.config;
		}
	}
	return n;
}

/*
 * Add an event to the PMU.
 * If all events are not already frozen, then we disable and
 * re-enable the PMU in order to get power_pmu_enable() to do the
 * actual work of reconfiguring the PMU.
 */
static int power_pmu_add(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	int n0;
	int ret = -EAGAIN;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	/*
	 * Add the event to the list (if there is room)
	 * and check whether the total set is still feasible.
	 */
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n0 = cpuhw->n_events;
	if (n0 >= ppmu->n_counter)
		goto out;
	cpuhw->event[n0] = event;
	cpuhw->events[n0] = event->hw.config;
	cpuhw->flags[n0] = event->hw.event_base;

	/*
	 * This event may have been disabled/stopped in record_and_restart()
	 * because we exceeded the ->event_limit. If re-starting the event,
	 * clear the ->hw.state (STOPPED and UPTODATE flags), so the user
	 * notification is re-enabled.
	 */
	if (!(ef_flags & PERF_EF_START))
		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
	else
		event->hw.state = 0;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuhw->group_flag & PERF_EVENT_TXN)
		goto nocheck;

	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
		goto out;
	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
		goto out;
	event->hw.config = cpuhw->events[n0];

nocheck:
	++cpuhw->n_events;
	++cpuhw->n_added;

	ret = 0;
 out:
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
	return ret;
}

/*
 * Remove an event from the PMU.
 */
static void power_pmu_del(struct perf_event *event, int ef_flags)
{
	struct cpu_hw_events *cpuhw;
	long i;
	unsigned long flags;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);

	cpuhw = &__get_cpu_var(cpu_hw_events);
	for (i = 0; i < cpuhw->n_events; ++i) {
		if (event == cpuhw->event[i]) {
			while (++i < cpuhw->n_events) {
				cpuhw->event[i-1] = cpuhw->event[i];
				cpuhw->events[i-1] = cpuhw->events[i];
				cpuhw->flags[i-1] = cpuhw->flags[i];
			}
			--cpuhw->n_events;
			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
			if (event->hw.idx) {
				write_pmc(event->hw.idx, 0);
				event->hw.idx = 0;
			}
			perf_event_update_userpage(event);
			break;
		}
	}
	for (i = 0; i < cpuhw->n_limited; ++i)
		if (event == cpuhw->limited_counter[i])
			break;
	if (i < cpuhw->n_limited) {
		while (++i < cpuhw->n_limited) {
			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
		}
		--cpuhw->n_limited;
	}
	if (cpuhw->n_events == 0) {
		/* disable exceptions if no events are running */
		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
	}

	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * POWER-PMU does not support disabling individual counters, hence
 * program their cycle counter to their max value and ignore the interrupts.
 */

static void power_pmu_start(struct perf_event *event, int ef_flags)
{
	unsigned long flags;
	s64 left;
	unsigned long val;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (!(event->hw.state & PERF_HES_STOPPED))
		return;

	if (ef_flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	event->hw.state = 0;
	left = local64_read(&event->hw.period_left);

	val = 0;
	if (left < 0x80000000L)
		val = 0x80000000L - left;

	write_pmc(event->hw.idx, val);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

static void power_pmu_stop(struct perf_event *event, int ef_flags)
{
	unsigned long flags;

	if (!event->hw.idx || !event->hw.sample_period)
		return;

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	local_irq_save(flags);
	perf_pmu_disable(event->pmu);

	power_pmu_read(event);
	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	write_pmc(event->hw.idx, 0);

	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	local_irq_restore(flags);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
void power_pmu_start_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	perf_pmu_disable(pmu);
	cpuhw->group_flag |= PERF_EVENT_TXN;
	cpuhw->n_txn_start = cpuhw->n_events;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
void power_pmu_cancel_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
int power_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	long i, n;

	if (!ppmu)
		return -EAGAIN;
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n = cpuhw->n_events;
	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
		return -EAGAIN;
	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
	if (i < 0)
		return -EAGAIN;

	for (i = cpuhw->n_txn_start; i < n; ++i)
		cpuhw->event[i]->hw.config = cpuhw->events[i];

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}

/*
 * Return 1 if we might be able to put the event on a limited PMC,
 * or 0 if not.
 * An event can only go on a limited PMC if it counts something
 * that a limited PMC can count, doesn't require interrupts, and
 * doesn't exclude any processor mode.
 */
static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
				 unsigned int flags)
{
	int n;
	u64 alt[MAX_EVENT_ALTERNATIVES];

	if (event->attr.exclude_user
	    || event->attr.exclude_kernel
	    || event->attr.exclude_hv
	    || event->attr.sample_period)
		return 0;

	if (ppmu->limited_pmc_event(ev))
		return 1;

	/*
	 * The requested event_id isn't on a limited PMC already;
	 * see if any alternative code goes on a limited PMC.
	 */
	if (!ppmu->get_alternatives)
		return 0;

	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
	n = ppmu->get_alternatives(ev, flags, alt);

	return n > 0;
}

/*
 * Find an alternative event_id that goes on a normal PMC, if possible,
 * and return the event_id code, or 0 if there is no such alternative.
 * (Note: event_id code 0 is "don't count" on all machines.)
 */
static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
{
	u64 alt[MAX_EVENT_ALTERNATIVES];
	int n;

	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
	n = ppmu->get_alternatives(ev, flags, alt);
	if (!n)
		return 0;
	return alt[0];
}

/* Number of perf_events counting hardware events */
static atomic_t num_events;
/* Used to avoid races in calling reserve/release_pmc_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/*
 * Release the PMU if this is the last perf_event.
 */
static void hw_perf_event_destroy(struct perf_event *event)
{
	if (!atomic_add_unless(&num_events, -1, 1)) {
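		/*
		 * atomic_add_unless() refused to decrement, which means
		 * num_events was 1: we are destroying the last event, so
		 * drop the count under the mutex and release the hardware
		 * if it really reaches zero.
		 */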
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_events) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Translate a generic cache event_id config to a raw event_id code.
 */
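/*
 * For reference, the generic encoding packs (cache, op, result) into
 * the low three bytes of attr.config, e.g.
 *   PERF_COUNT_HW_CACHE_L1D |
 *   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
 * asks for L1 data-cache read misses; the cache_events table below
 * maps that to this PMU's raw event code.
 */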
static int hw_perf_cache_event(u64 config, u64 *eventp)
{
	unsigned long type, op, result;
	int ev;

	if (!ppmu->cache_events)
		return -EINVAL;

	/* unpack config */
	type = config & 0xff;
	op = (config >> 8) & 0xff;
	result = (config >> 16) & 0xff;

	if (type >= PERF_COUNT_HW_CACHE_MAX ||
	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	ev = (*ppmu->cache_events)[type][op][result];
	if (ev == 0)
		return -EOPNOTSUPP;
	if (ev == -1)
		return -EINVAL;
	*eventp = ev;
	return 0;
}

static int power_pmu_event_init(struct perf_event *event)
{
	u64 ev;
	unsigned long flags;
	struct perf_event *ctrs[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int cflags[MAX_HWEVENTS];
	int n;
	int err;
	struct cpu_hw_events *cpuhw;

	if (!ppmu)
		return -ENOENT;

	/* does not support taken branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		ev = event->attr.config;
		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
			return -EOPNOTSUPP;
		ev = ppmu->generic_events[ev];
		break;
	case PERF_TYPE_HW_CACHE:
		err = hw_perf_cache_event(event->attr.config, &ev);
		if (err)
			return err;
		break;
	case PERF_TYPE_RAW:
		ev = event->attr.config;
		break;
	default:
		return -ENOENT;
	}

	event->hw.config_base = ev;
	event->hw.idx = 0;

	/*
	 * If we are not running on a hypervisor, force the
	 * exclude_hv bit to 0 so that we don't care what
	 * the user set it to.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		event->attr.exclude_hv = 0;

	/*
	 * If this is a per-task event, then we can use
	 * PM_RUN_* events interchangeably with their non RUN_*
	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
	 * XXX we should check if the task is an idle task.
	 */
	flags = 0;
	if (event->attach_state & PERF_ATTACH_TASK)
		flags |= PPMU_ONLY_COUNT_RUN;

	/*
	 * If this machine has limited events, check whether this
	 * event_id could go on a limited event.
	 */
	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
		if (can_go_on_limited_pmc(event, ev, flags)) {
			flags |= PPMU_LIMITED_PMC_OK;
		} else if (ppmu->limited_pmc_event(ev)) {
			/*
			 * The requested event_id is on a limited PMC,
			 * but we can't use a limited PMC; see if any
			 * alternative goes on a normal PMC.
			 */
			ev = normal_pmc_alternative(ev, flags);
			if (!ev)
				return -EINVAL;
		}
	}

	/*
	 * If this is in a group, check if it can go on with all the
	 * other hardware events in the group.  We assume the event
	 * hasn't been linked into its leader's sibling list at this point.
	 */
	n = 0;
	if (event->group_leader != event) {
		n = collect_events(event->group_leader, ppmu->n_counter - 1,
				   ctrs, events, cflags);
		if (n < 0)
			return -EINVAL;
	}
	events[n] = ev;
	ctrs[n] = event;
	cflags[n] = flags;
	if (check_excludes(ctrs, cflags, n, 1))
		return -EINVAL;

	cpuhw = &get_cpu_var(cpu_hw_events);
	err = power_check_constraints(cpuhw, events, cflags, n + 1);
	put_cpu_var(cpu_hw_events);
	if (err)
		return -EINVAL;

	event->hw.config = events[n];
	event->hw.event_base = cflags[n];
	event->hw.last_period = event->hw.sample_period;
	local64_set(&event->hw.period_left, event->hw.last_period);

	/*
	 * See if we need to reserve the PMU.
	 * If no events are currently in use, then we have to take a
	 * mutex to ensure that we don't race with another task doing
	 * reserve_pmc_hardware or release_pmc_hardware.
	 */
	err = 0;
	if (!atomic_inc_not_zero(&num_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_events) == 0 &&
		    reserve_pmc_hardware(perf_event_interrupt))
			err = -EBUSY;
		else
			atomic_inc(&num_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	event->destroy = hw_perf_event_destroy;

	return err;
}

static int power_pmu_event_idx(struct perf_event *event)
{
	return event->hw.idx;
}

ssize_t power_events_sysfs_show(struct device *dev,
				struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);

	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
}

struct pmu power_pmu = {
	.pmu_enable	= power_pmu_enable,
	.pmu_disable	= power_pmu_disable,
	.event_init	= power_pmu_event_init,
	.add		= power_pmu_add,
	.del		= power_pmu_del,
	.start		= power_pmu_start,
	.stop		= power_pmu_stop,
	.read		= power_pmu_read,
	.start_txn	= power_pmu_start_txn,
	.cancel_txn	= power_pmu_cancel_txn,
	.commit_txn	= power_pmu_commit_txn,
	.event_idx	= power_pmu_event_idx,
};


/*
 * A counter has overflowed; update its count and record
 * things if requested.  Note that interrupts are hard-disabled
 * here so there is no possibility of being interrupted.
 */
static void record_and_restart(struct perf_event *event, unsigned long val,
			       struct pt_regs *regs)
{
	u64 period = event->hw.sample_period;
	s64 prev, delta, left;
	int record = 0;

	if (event->hw.state & PERF_HES_STOPPED) {
		write_pmc(event->hw.idx, 0);
		return;
	}

	/* we don't have to worry about interrupts here */
	prev = local64_read(&event->hw.prev_count);
	delta = check_and_compute_delta(prev, val);
	local64_add(delta, &event->count);

	/*
	 * See if the total period for this event has expired,
	 * and update for the next period.
	 */
	val = 0;
	left = local64_read(&event->hw.period_left) - delta;
	if (delta == 0)
		left++;
	if (period) {
		if (left <= 0) {
			left += period;
			if (left <= 0)
				left = period;
			record = siar_valid(regs);
			event->hw.last_period = event->hw.sample_period;
		}
		if (left < 0x80000000LL)
			val = 0x80000000LL - left;
	}

	write_pmc(event->hw.idx, val);
	local64_set(&event->hw.prev_count, val);
	local64_set(&event->hw.period_left, left);
	perf_event_update_userpage(event);

	/*
	 * Finally record data if requested.
	 */
	if (record) {
		struct perf_sample_data data;

		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);

		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
			perf_get_data_addr(regs, &data.addr);

		if (perf_event_overflow(event, &data, regs))
			power_pmu_stop(event, 0);
	}
}

/*
 * Called from generic code to get the misc flags (i.e. processor mode)
 * for an event_id.
 */
unsigned long perf_misc_flags(struct pt_regs *regs)
{
	u32 flags = perf_get_misc_flags(regs);

	if (flags)
		return flags;
	return user_mode(regs) ? PERF_RECORD_MISC_USER :
		PERF_RECORD_MISC_KERNEL;
}

/*
 * Called from generic code to get the instruction pointer
 * for an event_id.
 */
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	bool use_siar = regs_use_siar(regs);

	if (use_siar && siar_valid(regs))
		return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
	else if (use_siar)
		return 0;		/* no valid instruction pointer */
	else
		return regs->nip;
}

static bool pmc_overflow_power7(unsigned long val)
{
	/*
	 * Events on POWER7 can roll back if a speculative event doesn't
	 * eventually complete. Unfortunately in some rare cases they will
	 * raise a performance monitor exception. We need to catch this to
	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
	 * cycles from overflow.
	 *
	 * We only do this if the first pass fails to find any overflowing
	 * PMCs because a user might set a period of less than 256 and we
	 * don't want to mistakenly reset them.
	 */
	if ((0x80000000 - val) <= 256)
		return true;

	return false;
}

static bool pmc_overflow(unsigned long val)
{
	if ((int)val < 0)
		return true;

	return false;
}

/*
 * Performance monitor interrupt stuff
 */
static void perf_event_interrupt(struct pt_regs *regs)
{
	int i, j;
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	unsigned long val[8];
	int found, active;
	int nmi;

	if (cpuhw->n_limited)
		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
					mfspr(SPRN_PMC6));

	perf_read_regs(regs);

	nmi = perf_intr_is_nmi(regs);
	if (nmi)
		nmi_enter();
	else
		irq_enter();

	/* Read all the PMCs since we'll need them a bunch of times */
	for (i = 0; i < ppmu->n_counter; ++i)
		val[i] = read_pmc(i + 1);

	/* Try to find what caused the IRQ */
	found = 0;
	for (i = 0; i < ppmu->n_counter; ++i) {
		if (!pmc_overflow(val[i]))
			continue;
		if (is_limited_pmc(i + 1))
			continue; /* these won't generate IRQs */
		/*
		 * We've found one that's overflowed.  For active
		 * counters we need to log this.  For inactive
		 * counters, we need to reset it anyway
		 */
		found = 1;
		active = 0;
		for (j = 0; j < cpuhw->n_events; ++j) {
			event = cpuhw->event[j];
			if (event->hw.idx == (i + 1)) {
				active = 1;
				record_and_restart(event, val[i], regs);
				break;
			}
		}
		if (!active)
			/* reset non active counters that have overflowed */
			write_pmc(i + 1, 0);
	}
	if (!found && pvr_version_is(PVR_POWER7)) {
		/* check active counters for special buggy p7 overflow */
		for (i = 0; i < cpuhw->n_events; ++i) {
			event = cpuhw->event[i];
			if (!event->hw.idx || is_limited_pmc(event->hw.idx))
				continue;
			if (pmc_overflow_power7(val[event->hw.idx - 1])) {
				/* event has overflowed in a buggy way */
				found = 1;
				record_and_restart(event,
						   val[event->hw.idx - 1],
						   regs);
			}
		}
	}
	if ((!found) && printk_ratelimit())
		printk(KERN_WARNING "Can't find PMC that caused IRQ\n");

	/*
	 * Reset MMCR0 to its normal value.  This will set PMXE and
	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
	 * and thus allow interrupts to occur again.
	 * XXX might want to use MSR.PM to keep the events frozen until
	 * we get back out of this interrupt.
	 */
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	if (nmi)
		nmi_exit();
	else
		irq_exit();
}

static void power_pmu_setup(int cpu)
{
	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

	if (!ppmu)
		return;
	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
}

static int __cpuinit
power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		power_pmu_setup(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

int __cpuinit register_power_pmu(struct power_pmu *pmu)
{
	if (ppmu)
		return -EBUSY;		/* something's already registered */

	ppmu = pmu;
	pr_info("%s performance monitor hardware support registered\n",
		pmu->name);

	power_pmu.attr_groups = ppmu->attr_groups;

#ifdef MSR_HV
	/*
	 * Use FCHV to ignore kernel events if MSR.HV is set.
	 */
	if (mfmsr() & MSR_HV)
		freeze_events_kernel = MMCR0_FCHV;
#endif /* MSR_HV */

	perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(power_pmu_notifier);

	return 0;
}