/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/perf_counter.h>
#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	u64			global_enable;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
	u64		(*save_disable_all)(void);
	void		(*restore_all)(u64 ctrl);
	unsigned	eventsel;
	unsigned	perfctr;
	int		(*event_map)(int event);
	int		max_events;
};

static struct pmc_x86_ops *pmc_ops;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const int intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
};

static int pmc_intel_event_map(int event)
{
	return intel_perfmon_event_map[event];
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const int amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

static int pmc_amd_event_map(int event)
{
	return amd_perfmon_event_map[event];
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	u64 prev_raw_count, new_raw_count, delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count, so we do that by clipping the delta to 32 bits:
	 */
	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	struct hw_perf_counter *hwc = &counter->hw;

	if (unlikely(!perf_counters_initialized))
		return -EINVAL;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!hw_event->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!hw_event->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	/*
	 * If privileged enough, allow NMI events:
	 */
	hwc->nmi = 0;
	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
		hwc->nmi = 1;

	hwc->irq_period		= hw_event->irq_period;
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
			hwc->irq_period = 0x7FFFFFFF;

	atomic64_set(&hwc->period_left, hwc->irq_period);

	/*
	 * Raw event types provide the config in the event structure
	 */
	if (hw_event->raw) {
		hwc->config |= hw_event->type;
	} else {
		if (hw_event->type >= pmc_ops->max_events)
			return -EINVAL;
		/*
		 * The generic map:
		 */
		hwc->config |= pmc_ops->event_map(hw_event->type);
	}
	counter->wakeup_pending = 0;

	return 0;
}

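/*
 * Disable all counters via the global control MSR and return its
 * previous value so that hw_perf_restore() can re-enable them later.
 */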
static u64 pmc_intel_save_disable_all(void)
{
	u64 ctrl;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

	return ctrl;
}

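/*
 * AMD K7-style PMUs have no global control MSR, so clear the enable
 * bit in each event-select MSR and return a bitmask of the counters
 * that were enabled.
 */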
static u64 pmc_amd_save_disable_all(void)
{
	int idx;
	u64 val, ctrl = 0;

	for (idx = 0; idx < nr_counters_generic; idx++) {
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			ctrl |= (1 << idx);
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}

	return ctrl;
}

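/*
 * Disable all counters on this CPU and return an opaque value that a
 * later hw_perf_restore() call accepts to undo the operation.
 */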
u64 hw_perf_save_disable(void)
{
	if (unlikely(!perf_counters_initialized))
		return 0;

	return pmc_ops->save_disable_all();
}
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

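/*
 * Re-enable exactly the counters recorded in the ctrl bitmask that
 * pmc_amd_save_disable_all() returned.
 */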
static void pmc_amd_restore_all(u64 ctrl)
{
	u64 val;
	int idx;

	for (idx = 0; idx < nr_counters_generic; idx++) {
		if (ctrl & (1 << idx)) {
			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
		}
	}
}

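/*
 * Re-enable the counters that were active before the matching
 * hw_perf_save_disable() call.
 */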
void hw_perf_restore(u64 ctrl)
{
	if (unlikely(!perf_counters_initialized))
		return;

	pmc_ops->restore_all(ctrl);
}
EXPORT_SYMBOL_GPL(hw_perf_restore);

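/*
 * Fixed-purpose counters are controlled by 4-bit fields in
 * MSR_ARCH_PERFMON_FIXED_CTR_CTRL; clearing the field disables the
 * counter.
 */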
static inline void
__pmc_fixed_disable(struct perf_counter *counter,
		    struct hw_perf_counter *hwc, unsigned int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

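/*
 * Disable a counter, dispatching to the fixed-purpose path when the
 * counter lives in the fixed PMC space.
 */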
static inline void
__pmc_generic_disable(struct perf_counter *counter,
			   struct hw_perf_counter *hwc, unsigned int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
		__pmc_fixed_disable(counter, hwc, idx);
	else
		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s32 period = hwc->irq_period;
	int err;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
	}

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
		   struct hw_perf_counter *hwc, unsigned int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

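/*
 * Enable a counter: fixed-purpose counters go through the fixed control
 * MSR, generic ones get their event-select MSR written with the enable
 * bit set.
 */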
static void
__pmc_generic_enable(struct perf_counter *counter,
			  struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
		__pmc_fixed_enable(counter, hwc, idx);
	else
		wrmsr(hwc->config_base + idx,
		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}

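/*
 * Map a generic event onto a fixed-purpose counter index, or return -1
 * if the event cannot use a fixed counter (AMD CPUs and NMI counters
 * always use the generic counters).
 */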
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		return -1;

	if (unlikely(hwc->nmi))
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
			if (idx == nr_counters_generic)
				return -EAGAIN;

			set_bit(idx, cpuc->used);
			hwc->idx = idx;
		}
		hwc->config_base  = pmc_ops->eventsel;
		hwc->counter_base = pmc_ops->perfctr;
	}

	perf_counters_lapic_init(hwc->nmi);

	__pmc_generic_disable(counter, hwc, idx);

	cpuc->counters[idx] = counter;
	/*
	 * Make it visible before enabling the hw:
	 */
	smp_wmb();

	__hw_perf_counter_set_period(counter, hwc, idx);
	__pmc_generic_enable(counter, hwc, idx);

	return 0;
}

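/*
 * Dump the current PMU register state of this CPU to the kernel log,
 * for debugging.
 */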
void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	int cpu, idx;

	if (!nr_counters_generic)
		return;

	local_irq_disable();

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
	}
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

	for (idx = 0; idx < nr_counters_generic; idx++) {
		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < nr_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_enable();
}

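/*
 * Disable a counter and release its PMC slot, folding the remaining
 * hardware delta into the generic counter value.
 */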
static void pmc_generic_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	unsigned int idx = hwc->idx;

	__pmc_generic_disable(counter, hwc, idx);

	clear_bit(idx, cpuc->used);
	cpuc->counters[idx] = NULL;
	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	smp_wmb();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
}

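/*
 * Append one u64 of IRQ data to the counter's data buffer, counting an
 * overrun if the buffer is full.
 */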
static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
	struct perf_data *irqdata = counter->irqdata;

	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
		irqdata->overrun++;
	} else {
		u64 *p = (u64 *) &irqdata->data[irqdata->len];

		*p = data;
		irqdata->len += sizeof(u64);
	}
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	x86_perf_counter_update(counter, hwc, idx);
	__hw_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		__pmc_generic_enable(counter, hwc, idx);
}

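/*
 * Record the event type and current value of each counter on the group
 * leader's sibling list into the sibling's IRQ data buffer.
 */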
static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
	struct perf_counter *counter, *group_leader = sibling->group_leader;

	/*
	 * Store sibling timestamps (if any):
	 */
	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {

		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
		perf_store_irq_data(sibling, counter->hw_event.type);
		perf_store_irq_data(sibling, atomic64_read(&counter->count));
	}
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
	int bit, cpu = smp_processor_id();
	u64 ack, status;
	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);

	/* Disable counters globally */
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
	ack_APIC_irq();

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (!status)
		goto out;

again:
	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!counter)
			continue;

		perf_save_and_restart(counter);

		switch (counter->hw_event.record_type) {
		case PERF_RECORD_SIMPLE:
			continue;
		case PERF_RECORD_IRQ:
			perf_store_irq_data(counter, instruction_pointer(regs));
			break;
		case PERF_RECORD_GROUP:
			perf_handle_group(counter, &status, &ack);
			break;
		}
		/*
		 * From NMI context we cannot call into the scheduler to
		 * do a task wakeup - but we mark these counters as
		 * wakeup_pending and initiate a wakeup callback:
		 */
		if (nmi) {
			counter->wakeup_pending = 1;
			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
		} else {
			wake_up(&counter->waitq);
		}
	}

	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (status)
		goto again;
out:
	/*
	 * Restore - do not reenable when global enable is off or throttled:
	 */
	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
}

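/*
 * Re-enable counters that were throttled after exceeding
 * PERFMON_MAX_INTERRUPTS, and reset the per-CPU interrupt count.
 */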
void perf_counter_unthrottle(void)
{
	struct cpu_hw_counters *cpuc;
	u64 global_enable;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return;

	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		return;

	if (unlikely(!perf_counters_initialized))
		return;

	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
		if (printk_ratelimit())
			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
	}
	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
	if (unlikely(cpuc->global_enable && !global_enable))
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
	cpuc->interrupts = 0;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
	irq_enter();
	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	__smp_perf_counter_interrupt(regs, 0);

	irq_exit();
}

/*
 * This handler is triggered by NMI contexts:
 */
void perf_counter_notify(struct pt_regs *regs)
{
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int bit, cpu;

	local_irq_save(flags);
	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		if (!counter)
			continue;

		if (counter->wakeup_pending) {
			counter->wakeup_pending = 0;
			wake_up(&counter->waitq);
		}
	}

	local_irq_restore(flags);
}

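/*
 * Program the local APIC LVT entry that delivers PMC overflow
 * interrupts, either as a regular vector or as an NMI.
 */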
void perf_counters_lapic_init(int nmi)
{
	u32 apic_val;

	if (!perf_counters_initialized)
		return;
	/*
	 * Enable the performance counter vector in the APIC LVT:
	 */
	apic_val = apic_read(APIC_LVTERR);

	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
	if (nmi)
		apic_write(APIC_LVTPC, APIC_DM_NMI);
	else
		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	apic_write(APIC_LVTERR, apic_val);
}

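/*
 * die-notifier callback: route perfmon NMIs into the counter interrupt
 * handler.
 */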
static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (likely(cmd != DIE_NMI_IPI))
		return NOTIFY_DONE;

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	__smp_perf_counter_interrupt(regs, 1);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct pmc_x86_ops pmc_intel_ops = {
	.save_disable_all	= pmc_intel_save_disable_all,
	.restore_all		= pmc_intel_restore_all,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= pmc_intel_event_map,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops pmc_amd_ops = {
	.save_disable_all	= pmc_amd_save_disable_all,
	.restore_all		= pmc_amd_restore_all,
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
	.event_map		= pmc_amd_event_map,
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
};

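/*
 * Probe the architectural PerfMon CPUID leaf and fill in the generic
 * and fixed counter counts plus the counter value mask.
 */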
static struct pmc_x86_ops *pmc_intel_init(void)
{
	union cpuid10_eax eax;
	unsigned int ebx;
	unsigned int unused;
	union cpuid10_edx edx;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return NULL;

	pr_info("Intel Performance Monitoring support detected.\n");
	pr_info("... version:         %d\n", eax.split.version_id);
	pr_info("... bit width:       %d\n", eax.split.bit_width);
	pr_info("... mask length:     %d\n", eax.split.mask_length);

	nr_counters_generic = eax.split.num_counters;
	nr_counters_fixed = edx.split.num_counters_fixed;
	counter_value_mask = (1ULL << eax.split.bit_width) - 1;

	return &pmc_intel_ops;
}

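/*
 * AMD K7 and later expose four generic counters and no fixed-purpose
 * ones.
 */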
static struct pmc_x86_ops *pmc_amd_init(void)
{
	nr_counters_generic = 4;
	nr_counters_fixed = 0;

	pr_info("AMD Performance Monitoring support detected.\n");

	return &pmc_amd_ops;
}

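/*
 * Detect and initialize PMU support at boot, then hook up the local
 * APIC vector and the NMI die-notifier.
 */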
void __init init_hw_perf_counters(void)
{
	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		pmc_ops = pmc_intel_init();
		break;
	case X86_VENDOR_AMD:
		pmc_ops = pmc_amd_init();
		break;
	}
	if (!pmc_ops)
		return;

	pr_info("... num counters:    %d\n", nr_counters_generic);
	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
		nr_counters_generic = X86_PMC_MAX_GENERIC;
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
			nr_counters_generic, X86_PMC_MAX_GENERIC);
	}
	perf_counter_mask = (1 << nr_counters_generic) - 1;
	perf_max_counters = nr_counters_generic;

	pr_info("... value mask:      %016Lx\n", counter_value_mask);

	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
		nr_counters_fixed = X86_PMC_MAX_FIXED;
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
			nr_counters_fixed, X86_PMC_MAX_FIXED);
	}
	pr_info("... fixed counters:  %d\n", nr_counters_fixed);

	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
	perf_counters_initialized = true;

	perf_counters_lapic_init(0);
	register_die_notifier(&perf_counter_nmi_notifier);
}

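/*
 * Sync the hardware count into the generic counter value.
 */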
static void pmc_generic_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
	.enable		= pmc_generic_enable,
	.disable	= pmc_generic_disable,
	.read		= pmc_generic_read,
};

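/*
 * Validate the counter's hw_event and return the x86 hw-counter ops on
 * success, NULL on failure.
 */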
const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return NULL;

	return &x86_perf_counter_ops;
}