/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/perf_counter.h>
#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	u64			global_enable;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
	u64 (*save_disable_all)		(void);
	void (*restore_all)		(u64 ctrl);
	unsigned eventsel;
	unsigned perfctr;
	int (*event_map)		(int event);
	int max_events;
};

static struct pmc_x86_ops *pmc_ops;
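/*
 * pmc_ops is selected at boot in init_hw_perf_counters(); another CPU
 * vendor backend (e.g. a future AMD one) would plug in its own
 * pmc_x86_ops with its own eventsel/perfctr MSR bases and event map.
 */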

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const int intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
};

static int pmc_intel_event_map(int event)
{
	return intel_perfmon_event_map[event];
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Updates counter->count and hwc->period_left.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	u64 prev_raw_count, new_raw_count, delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count, so we do that by clipping the delta to 32 bits:
	 */
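	/*
	 * E.g. a prev_raw_count of 0x00000000ffffff00 and a new raw read
	 * of 0x0000000100000010 truncate to 0xffffff00 and 0x00000010;
	 * the signed 32-bit subtraction yields 0x110 (272), the number
	 * of events that really elapsed.
	 */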
	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);
}

/*
 * Set up the hardware configuration for a given hw_event type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	struct hw_perf_counter *hwc = &counter->hw;

	if (unlikely(!perf_counters_initialized))
		return -EINVAL;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!hw_event->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!hw_event->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	/*
	 * If privileged enough, allow NMI events:
	 */
	hwc->nmi = 0;
	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
		hwc->nmi = 1;

	hwc->irq_period		= hw_event->irq_period;
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
		hwc->irq_period = 0x7FFFFFFF;

	atomic64_set(&hwc->period_left, hwc->irq_period);

	/*
	 * Raw event types provide the config directly in the event structure
	 */
	if (hw_event->raw) {
		hwc->config |= hw_event->type;
	} else {
		if (hw_event->type >= pmc_ops->max_events)
			return -EINVAL;
		/*
		 * The generic map:
		 */
		hwc->config |= pmc_ops->event_map(hw_event->type);
	}
	counter->wakeup_pending = 0;

	return 0;
}

static u64 pmc_intel_save_disable_all(void)
{
	u64 ctrl;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

	return ctrl;
}

u64 hw_perf_save_disable(void)
{
	if (unlikely(!perf_counters_initialized))
		return 0;

	return pmc_ops->save_disable_all();
}
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

void hw_perf_restore(u64 ctrl)
{
	if (unlikely(!perf_counters_initialized))
		return;

	pmc_ops->restore_all(ctrl);
}
EXPORT_SYMBOL_GPL(hw_perf_restore);

static inline void
__pmc_fixed_disable(struct perf_counter *counter,
		    struct hw_perf_counter *hwc, unsigned int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
__pmc_generic_disable(struct perf_counter *counter,
			   struct hw_perf_counter *hwc, unsigned int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
		__pmc_fixed_disable(counter, hwc, idx);
	else
		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s32 period = hwc->irq_period;
	int err;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
	}

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extract future deltas:
	 */
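	/*
	 * Writing -left into the hardware counter (below) makes it
	 * overflow, and fire its interrupt if enabled, after exactly
	 * 'left' further events.
	 */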
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
		   struct hw_perf_counter *hwc, unsigned int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
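	/*
	 * Each fixed counter owns a 4-bit field in this control MSR;
	 * e.g. for fixed counter 1 the bits above land in bits 4-7, so
	 * full ring-0 + ring-3 counting with PMI is ctrl_val |= 0xb << 4.
	 */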
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void
__pmc_generic_enable(struct perf_counter *counter,
			  struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
		__pmc_fixed_enable(counter, hwc, idx);
	else
		wrmsr(hwc->config_base + idx,
		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}

static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (unlikely(hwc->nmi))
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
			if (idx == nr_counters_generic)
				return -EAGAIN;

			set_bit(idx, cpuc->used);
			hwc->idx = idx;
		}
		hwc->config_base  = pmc_ops->eventsel;
		hwc->counter_base = pmc_ops->perfctr;
	}

	perf_counters_lapic_init(hwc->nmi);

	__pmc_generic_disable(counter, hwc, idx);

	cpuc->counters[idx] = counter;
	/*
	 * Make it visible before enabling the hw:
	 */
	smp_wmb();

	__hw_perf_counter_set_period(counter, hwc, idx);
	__pmc_generic_enable(counter, hwc, idx);

	return 0;
}

void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	int cpu, idx;

	if (!nr_counters_generic)
		return;

	local_irq_disable();

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

	printk(KERN_INFO "\n");
	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

	for (idx = 0; idx < nr_counters_generic; idx++) {
		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < nr_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_enable();
}

static void pmc_generic_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	unsigned int idx = hwc->idx;

	__pmc_generic_disable(counter, hwc, idx);

	clear_bit(idx, cpuc->used);
	cpuc->counters[idx] = NULL;
	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	smp_wmb();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
}

static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
	struct perf_data *irqdata = counter->irqdata;

	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
		irqdata->overrun++;
	} else {
		u64 *p = (u64 *) &irqdata->data[irqdata->len];

		*p = data;
		irqdata->len += sizeof(u64);
	}
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	x86_perf_counter_update(counter, hwc, idx);
	__hw_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		__pmc_generic_enable(counter, hwc, idx);
}

static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
	struct perf_counter *counter, *group_leader = sibling->group_leader;

	/*
	 * Store the event type and current count of each sibling (if any):
	 */
	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {

		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
		perf_store_irq_data(sibling, counter->hw_event.type);
		perf_store_irq_data(sibling, atomic64_read(&counter->count));
	}
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS 100000/HZ
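/*
 * E.g. at HZ=1000 this evaluates to 100: once a CPU has taken that many
 * counter interrupts, the handler stops re-enabling the counters until
 * perf_counter_unthrottle() resets cpuc->interrupts.
 */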

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
	int bit, cpu = smp_processor_id();
	u64 ack, status;
	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);

	/* Disable counters globally */
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
	ack_APIC_irq();

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (!status)
		goto out;

again:
	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!counter)
			continue;

		perf_save_and_restart(counter);

		switch (counter->hw_event.record_type) {
		case PERF_RECORD_SIMPLE:
			continue;
		case PERF_RECORD_IRQ:
			perf_store_irq_data(counter, instruction_pointer(regs));
			break;
		case PERF_RECORD_GROUP:
			perf_handle_group(counter, &status, &ack);
			break;
		}
		/*
		 * From NMI context we cannot call into the scheduler to
		 * do a task wakeup - but we mark these counters as
		 * wakeup_pending and initiate a wakeup callback:
		 */
		if (nmi) {
			counter->wakeup_pending = 1;
			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
		} else {
			wake_up(&counter->waitq);
		}
	}

	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (status)
		goto again;
out:
	/*
	 * Restore - do not reenable when global enable is off or throttled:
	 */
	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
}

void perf_counter_unthrottle(void)
{
	struct cpu_hw_counters *cpuc;
	u64 global_enable;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return;

	if (unlikely(!perf_counters_initialized))
		return;

	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
		if (printk_ratelimit())
			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
	}
	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
	if (unlikely(cpuc->global_enable && !global_enable))
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
	cpuc->interrupts = 0;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
	irq_enter();
	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	__smp_perf_counter_interrupt(regs, 0);

	irq_exit();
}

/*
 * This handler is triggered by NMI contexts:
 */
void perf_counter_notify(struct pt_regs *regs)
{
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int bit, cpu;

	local_irq_save(flags);
	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		if (!counter)
			continue;

		if (counter->wakeup_pending) {
			counter->wakeup_pending = 0;
			wake_up(&counter->waitq);
		}
	}

	local_irq_restore(flags);
}

void perf_counters_lapic_init(int nmi)
{
	u32 apic_val;

	if (!perf_counters_initialized)
		return;
	/*
	 * Enable the performance counter vector in the APIC LVT:
	 */
	apic_val = apic_read(APIC_LVTERR);

	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
	if (nmi)
		apic_write(APIC_LVTPC, APIC_DM_NMI);
	else
		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (likely(cmd != DIE_NMI_IPI))
		return NOTIFY_DONE;

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	__smp_perf_counter_interrupt(regs, 1);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct pmc_x86_ops pmc_intel_ops = {
	.save_disable_all	= pmc_intel_save_disable_all,
	.restore_all		= pmc_intel_restore_all,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= pmc_intel_event_map,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops *pmc_intel_init(void)
{
	union cpuid10_eax eax;
	unsigned int ebx;
	unsigned int unused;
	union cpuid10_edx edx;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return NULL;

	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);

	nr_counters_generic = eax.split.num_counters;
	nr_counters_fixed = edx.split.num_counters_fixed;
	counter_value_mask = (1ULL << eax.split.bit_width) - 1;

	return &pmc_intel_ops;
}

void __init init_hw_perf_counters(void)
{
	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		pmc_ops = pmc_intel_init();
		break;
	}
	if (!pmc_ops)
		return;

	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
		nr_counters_generic = X86_PMC_MAX_GENERIC;
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
			nr_counters_generic, X86_PMC_MAX_GENERIC);
	}
	perf_counter_mask = (1 << nr_counters_generic) - 1;
	perf_max_counters = nr_counters_generic;

	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);

	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
		nr_counters_fixed = X86_PMC_MAX_FIXED;
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
			nr_counters_fixed, X86_PMC_MAX_FIXED);
	}
	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);

	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
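	/*
	 * Generic counters occupy the low bits of the mask, fixed counters
	 * the bits from X86_PMC_IDX_FIXED (32) upwards; e.g. 2 generic plus
	 * 3 fixed counters gives a mask of 0x0000000700000003.
	 */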

	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
	perf_counters_initialized = true;

	perf_counters_lapic_init(0);
	register_die_notifier(&perf_counter_nmi_notifier);
}

static void pmc_generic_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
	.enable		= pmc_generic_enable,
	.disable	= pmc_generic_disable,
	.read		= pmc_generic_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return NULL;

	return &x86_perf_counter_ops;
}