/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>

#include "perf_event.h"

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif
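
/*
 * The #if 0 block above is a debugging aid: switching it to #if 1
 * replaces wrmsrl() throughout this file with a wrapper that
 * trace_printk()s the MSR number and value before doing the write via
 * native_write_msr(), so every counter programming step can be traced.
 */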

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
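
/*
 * Worked example of the delta logic above, assuming 48-bit counters
 * (x86_pmu.cntval_bits == 48, so shift == 16): if prev_raw_count was
 * 0x0000fffffffffff0 and the counter has wrapped to 0x0000000000000010,
 * shifting both values up by 16, subtracting and shifting the signed
 * result back down gives a delta of 0x20 (32) events - exactly the
 * distance counted across the wrap.
 */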

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
115
	struct hw_perf_event_extra *reg;
116 117
	struct extra_reg *er;

118
	reg = &event->hw.extra_reg;
119 120 121 122 123 124 125 126 127

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
128 129 130 131

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
132 133 134 135 136
		break;
	}
	return 0;
}

137
static atomic_t active_events;
P
Peter Zijlstra 已提交
138 139
static DEFINE_MUTEX(pmc_reserve_mutex);

140 141
#ifdef CONFIG_X86_LOCAL_APIC

P
Peter Zijlstra 已提交
142 143 144 145
static bool reserve_pmc_hardware(void)
{
	int i;

146
	for (i = 0; i < x86_pmu.num_counters; i++) {
147
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
P
Peter Zijlstra 已提交
148 149 150
			goto perfctr_fail;
	}

151
	for (i = 0; i < x86_pmu.num_counters; i++) {
152
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
P
Peter Zijlstra 已提交
153 154 155 156 157 158 159
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
160
		release_evntsel_nmi(x86_pmu_config_addr(i));
P
Peter Zijlstra 已提交
161

162
	i = x86_pmu.num_counters;
P
Peter Zijlstra 已提交
163 164 165

perfctr_fail:
	for (i--; i >= 0; i--)
166
		release_perfctr_nmi(x86_pmu_event_addr(i));
P
Peter Zijlstra 已提交
167 168 169 170 171 172 173 174

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

175
	for (i = 0; i < x86_pmu.num_counters; i++) {
176 177
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
P
Peter Zijlstra 已提交
178 179 180
	}
}

181 182 183 184 185 186 187
#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

188 189 190
static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
191
	int i, reg, ret = 0;
192

193 194 195 196 197
	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
198
		reg = x86_pmu_config_addr(i);
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
			goto bios_fail;
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4))
				goto bios_fail;
		}
	}

	/*
	 * Now write a value and read it back to see if it matches,
	 * this is needed to detect certain hardware emulators (qemu/kvm)
	 * that don't trap on the MSR access and always return 0s.
	 */
222
	val = 0xabcdUL;
223 224
	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
225
	if (ret || val != val_new)
226
		goto msr_fail;
227 228

	return true;
229 230

bios_fail:
231 232 233 234
	/*
	 * We still allow the PMU driver to operate:
	 */
	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
235
	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
236 237

	return true;
238 239 240

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
241

242
	return false;
243 244
}

245
static void hw_perf_event_destroy(struct perf_event *event)
P
Peter Zijlstra 已提交
246
{
247
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
P
Peter Zijlstra 已提交
248
		release_pmc_hardware();
249
		release_ds_buffers();
P
Peter Zijlstra 已提交
250 251 252 253
		mutex_unlock(&pmc_reserve_mutex);
	}
}

254 255 256 257 258
static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

259
static inline int
260
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
261
{
262
	struct perf_event_attr *attr = &event->attr;
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
289 290
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
291 292
}
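
/*
 * The attr->config value decoded above uses the generic perf cache
 * event encoding: cache type in bits 0-7, operation in bits 8-15 and
 * result in bits 16-23.  An L1 data-cache read miss, for example, is
 * PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 * (PERF_COUNT_HW_CACHE_RESULT_MISS << 16).
 */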

int x86_setup_perfctr(struct perf_event *event)
294 295 296 297 298
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

299
	if (!is_sampling_event(event)) {
300 301
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
302
		local64_set(&hwc->period_left, hwc->sample_period);
303 304 305 306 307 308 309 310 311 312 313 314
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	if (attr->type == PERF_TYPE_RAW)
315
		return x86_pmu_extra_regs(event->attr.config, event);
316 317

	if (attr->type == PERF_TYPE_HW_CACHE)
318
		return set_ext_hw_attr(hwc, event);
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
P
Peter Zijlstra 已提交
337 338
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
339
		/* BTS is not supported by this architecture. */
340
		if (!x86_pmu.bts_active)
341 342 343 344 345 346 347 348 349 350 351
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}
352

353
int x86_pmu_hw_config(struct perf_event *event)
354
{
P
Peter Zijlstra 已提交
355 356 357 358
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
359
		if (x86_pmu.pebs_active) {
P
Peter Zijlstra 已提交
360 361
			precise++;

362 363 364 365
			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}
P
Peter Zijlstra 已提交
366 367 368 369 370

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

371 372 373 374
	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
375
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
376 377 378 379

	/*
	 * Count user and OS events unless requested not to
	 */
380 381 382 383
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
384

385 386
	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
387

388
	return x86_setup_perfctr(event);
389 390
}

I
Ingo Molnar 已提交
391
/*
392
 * Setup the hardware configuration for a given attr_type
I
Ingo Molnar 已提交
393
 */
394
static int __x86_pmu_event_init(struct perf_event *event)
I
Ingo Molnar 已提交
395
{
P
Peter Zijlstra 已提交
396
	int err;
I
Ingo Molnar 已提交
397

398 399
	if (!x86_pmu_initialized())
		return -ENODEV;
I
Ingo Molnar 已提交
400

P
Peter Zijlstra 已提交
401
	err = 0;
402
	if (!atomic_inc_not_zero(&active_events)) {
P
Peter Zijlstra 已提交
403
		mutex_lock(&pmc_reserve_mutex);
404
		if (atomic_read(&active_events) == 0) {
405 406
			if (!reserve_pmc_hardware())
				err = -EBUSY;
407 408
			else
				reserve_ds_buffers();
409 410
		}
		if (!err)
411
			atomic_inc(&active_events);
P
Peter Zijlstra 已提交
412 413 414 415 416
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

417
	event->destroy = hw_perf_event_destroy;
418

419 420 421
	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;
422

423 424 425
	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;

426
	return x86_pmu.hw_config(event);
427 428
}

429
void x86_pmu_disable_all(void)
430
{
431
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
432 433
	int idx;

434
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
435 436
		u64 val;

437
		if (!test_bit(idx, cpuc->active_mask))
438
			continue;
439
		rdmsrl(x86_pmu_config_addr(idx), val);
440
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
441
			continue;
442
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
443
		wrmsrl(x86_pmu_config_addr(idx), val);
444 445 446
	}
}

P
Peter Zijlstra 已提交
447
static void x86_pmu_disable(struct pmu *pmu)
448
{
449 450
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

451
	if (!x86_pmu_initialized())
452
		return;
453

454 455 456 457 458 459
	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();
460 461

	x86_pmu.disable_all();
462
}
I
Ingo Molnar 已提交
463

464
void x86_pmu_enable_all(int added)
465
{
466
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
467 468
	int idx;

469
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
470
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
471

472
		if (!test_bit(idx, cpuc->active_mask))
473
			continue;
474

475
		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
476 477 478
	}
}

P
Peter Zijlstra 已提交
479
static struct pmu pmu;
480 481 482 483 484 485

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

501 502 503
/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

504 505 506 507 508
struct perf_sched {
	int			max_weight;
	int			max_events;
	struct event_constraint	**constraints;
	struct sched_state	state;
509 510
	int			saved_states;
	struct sched_state	saved[SCHED_STATES_MAX];
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
			    int num, int wmin, int wmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->constraints	= c;

	for (idx = 0; idx < num; idx++) {
		if (c[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

559 560 561 562
/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
563
static bool __perf_sched_find_counter(struct perf_sched *sched)
564 565 566 567 568 569 570 571 572 573 574 575
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];

576 577 578 579 580 581 582 583
	/* Prefer fixed purpose counters */
	if (x86_pmu.num_counters_fixed) {
		idx = X86_PMC_IDX_FIXED;
		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}
584 585
	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
586
	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
587
		if (!__test_and_set_bit(idx, sched->state.used))
588
			goto done;
589 590
	}

591 592 593 594
	return false;

done:
	sched->state.counter = idx;
595

596 597 598 599 600 601 602 603 604 605 606 607 608
	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
static int perf_assign_events(struct event_constraint **constraints, int n,
			      int wmin, int wmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
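
/*
 * Example of the weight-ordered walk above: three events constrained to
 * {counter 0}, {counters 0-1} and {counters 0-3} have weights 1, 2 and 4,
 * so the most constrained event is placed first and takes counter 0,
 * leaving counters 1 and 2 for the other two.  Offering the least
 * constrained event first could consume counter 0 and leave the weight-1
 * event unschedulable, which is what the min-weight ordering avoids.
 */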

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
662
{
663
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
664
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
665
	int i, wmin, wmax, num = 0;
666 667 668 669
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

670
	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
671 672
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
673 674
		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
675 676
	}

677 678 679
	/*
	 * fastpath, try to reuse previous register
	 */
680
	for (i = 0; i < n; i++) {
681
		hwc = &cpuc->event_list[i]->hw;
682
		c = constraints[i];
683 684 685 686 687 688

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
689
		if (!test_bit(hwc->idx, c->idxmsk))
690 691 692 693 694 695
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

P
Peter Zijlstra 已提交
696
		__set_bit(hwc->idx, used_mask);
697 698 699 700
		if (assign)
			assign[i] = hwc->idx;
	}

701 702 703
	/* slow path */
	if (i != n)
		num = perf_assign_events(constraints, n, wmin, wmax, assign);
704

705 706 707 708 709 710 711 712 713 714
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
715
	return num ? -EINVAL : 0;
716 717 718 719 720 721 722 723 724 725 726
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

727
	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 729 730 731 732 733

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
734
			return -EINVAL;
735 736 737 738 739 740 741 742
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
743
		    event->state <= PERF_EVENT_STATE_OFF)
744 745 746
			continue;

		if (n >= max_count)
747
			return -EINVAL;
748 749 750 751 752 753 754 755

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
756
				struct cpu_hw_events *cpuc, int i)
757
{
758 759 760 761 762
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];
763 764 765 766 767 768

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
769
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
770
	} else {
771 772
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
773 774 775
	}
}

776 777 778 779 780 781 782 783 784
static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

P
Peter Zijlstra 已提交
785
static void x86_pmu_start(struct perf_event *event, int flags);
786

P
Peter Zijlstra 已提交
787
static void x86_pmu_enable(struct pmu *pmu)
788
{
789 790 791
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
792
	int i, added = cpuc->n_added;
793

794
	if (!x86_pmu_initialized())
795
		return;
796 797 798 799

	if (cpuc->enabled)
		return;

800
	if (cpuc->n_added) {
801
		int n_running = cpuc->n_events - cpuc->n_added;
802 803 804 805 806 807 808
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
809
		for (i = 0; i < n_running; i++) {
810 811 812
			event = cpuc->event_list[i];
			hwc = &event->hw;

813 814 815 816 817 818 819 820
			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
821 822
				continue;

P
Peter Zijlstra 已提交
823 824 825 826 827 828 829 830
			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
831 832 833 834 835 836
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

837
			if (!match_prev_assignment(hwc, cpuc, i))
838
				x86_assign_hw_event(event, cpuc, i);
839 840
			else if (i < n_running)
				continue;
841

P
Peter Zijlstra 已提交
842 843 844 845
			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
846 847 848 849
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}
850 851 852 853

	cpuc->enabled = 1;
	barrier();

854
	x86_pmu.enable_all(added);
855 856
}

857
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
I
Ingo Molnar 已提交
858

859 860
/*
 * Set the next IRQ period, based on the hwc->period_left value.
861
 * To be called with the event disabled in hw:
862
 */
863
int x86_perf_event_set_period(struct perf_event *event)
I
Ingo Molnar 已提交
864
{
865
	struct hw_perf_event *hwc = &event->hw;
866
	s64 left = local64_read(&hwc->period_left);
867
	s64 period = hwc->sample_period;
868
	int ret = 0, idx = hwc->idx;
869

870 871 872
	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

873
	/*
874
	 * If we are way outside a reasonable range then just skip forward:
875 876 877
	 */
	if (unlikely(left <= -period)) {
		left = period;
878
		local64_set(&hwc->period_left, left);
879
		hwc->last_period = period;
880
		ret = 1;
881 882 883 884
	}

	if (unlikely(left <= 0)) {
		left += period;
885
		local64_set(&hwc->period_left, left);
886
		hwc->last_period = period;
887
		ret = 1;
888
	}
889
	/*
890
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
891 892 893
	 */
	if (unlikely(left < 2))
		left = 2;
I
Ingo Molnar 已提交
894

895 896 897
	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

898
	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
899 900

	/*
901
	 * The hw event starts counting from this event offset,
902 903
	 * mark it to be able to extract future deltas:
	 */
904
	local64_set(&hwc->prev_count, (u64)-left);
905

906
	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
907 908 909 910 911 912 913

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
914
		wrmsrl(hwc->event_base,
915
			(u64)(-left) & x86_pmu.cntval_mask);
916
	}
917

918
	perf_event_update_userpage(event);
919

920
	return ret;
921 922
}
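
/*
 * Example: with 100000 events left to count on a 48-bit counter, the
 * code above writes (u64)(-100000) & cntval_mask, i.e. 2^48 - 100000 =
 * 0xfffffffe7960, into the counter, so it overflows and raises the PMI
 * after exactly 100000 increments.
 */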

void x86_pmu_enable_event(struct perf_event *event)
924
{
T
Tejun Heo 已提交
925
	if (__this_cpu_read(cpu_hw_events.enabled))
926 927
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
I
Ingo Molnar 已提交
928 929
}

930
/*
P
Peter Zijlstra 已提交
931
 * Add a single event to the PMU.
932 933 934
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
935
 */
P
Peter Zijlstra 已提交
936
static int x86_pmu_add(struct perf_event *event, int flags)
937 938
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 940 941
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;
942

943
	hwc = &event->hw;
944

P
Peter Zijlstra 已提交
945
	perf_pmu_disable(event->pmu);
946
	n0 = cpuc->n_events;
947 948 949
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;
950

P
Peter Zijlstra 已提交
951 952 953 954
	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

955 956
	/*
	 * If group events scheduling transaction was started,
L
Lucas De Marchi 已提交
957
	 * skip the schedulability test here, it will be performed
P
Peter Zijlstra 已提交
958
	 * at commit time (->commit_txn) as a whole
959
	 */
960
	if (cpuc->group_flag & PERF_EVENT_TXN)
961
		goto done_collect;
962

963
	ret = x86_pmu.schedule_events(cpuc, n, assign);
964
	if (ret)
965
		goto out;
966 967 968 969 970
	/*
	 * copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));
971

972
done_collect:
973
	cpuc->n_events = n;
974
	cpuc->n_added += n - n0;
975
	cpuc->n_txn += n - n0;
976

977 978
	ret = 0;
out:
P
Peter Zijlstra 已提交
979
	perf_pmu_enable(event->pmu);
980
	return ret;
I
Ingo Molnar 已提交
981 982
}
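
/*
 * Note that x86_pmu_add() only collects the event and, outside of a
 * group transaction, verifies that an assignment exists; the hardware
 * counters are actually programmed later, from x86_pmu_enable(), using
 * the assignment saved in cpuc->assign above.
 */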

static void x86_pmu_start(struct perf_event *event, int flags)
984
{
P
Peter Zijlstra 已提交
985 986 987
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

P
Peter Zijlstra 已提交
988 989 990 991 992 993 994 995 996 997 998 999
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;
1000

P
Peter Zijlstra 已提交
1001 1002
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
1003
	__set_bit(idx, cpuc->running);
1004
	x86_pmu.enable(event);
P
Peter Zijlstra 已提交
1005
	perf_event_update_userpage(event);
1006 1007
}

1008
void perf_event_print_debug(void)
I
Ingo Molnar 已提交
1009
{
1010
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1011
	u64 pebs;
1012
	struct cpu_hw_events *cpuc;
1013
	unsigned long flags;
1014 1015
	int cpu, idx;

1016
	if (!x86_pmu.num_counters)
1017
		return;
I
Ingo Molnar 已提交
1018

1019
	local_irq_save(flags);
I
Ingo Molnar 已提交
1020 1021

	cpu = smp_processor_id();
1022
	cpuc = &per_cpu(cpu_hw_events, cpu);
I
Ingo Molnar 已提交
1023

1024
	if (x86_pmu.version >= 2) {
1025 1026 1027 1028
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1029
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1030 1031 1032 1033 1034 1035

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1036
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1037
	}
1038
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
I
Ingo Molnar 已提交
1039

1040
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1041 1042
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);
I
Ingo Molnar 已提交
1043

1044
		prev_left = per_cpu(pmc_prev_left[idx], cpu);
I
Ingo Molnar 已提交
1045

1046
		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
I
Ingo Molnar 已提交
1047
			cpu, idx, pmc_ctrl);
1048
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
I
Ingo Molnar 已提交
1049
			cpu, idx, pmc_count);
1050
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1051
			cpu, idx, prev_left);
I
Ingo Molnar 已提交
1052
	}
1053
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1054 1055
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

1056
		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1057 1058
			cpu, idx, pmc_count);
	}
1059
	local_irq_restore(flags);
I
Ingo Molnar 已提交
1060 1061
}

1062
void x86_pmu_stop(struct perf_event *event, int flags)
I
Ingo Molnar 已提交
1063
{
1064
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1065
	struct hw_perf_event *hwc = &event->hw;
I
Ingo Molnar 已提交
1066

P
Peter Zijlstra 已提交
1067 1068 1069 1070 1071 1072
	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}
1073

P
Peter Zijlstra 已提交
1074 1075 1076 1077 1078 1079 1080 1081
	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
1082 1083
}

P
Peter Zijlstra 已提交
1084
static void x86_pmu_del(struct perf_event *event, int flags)
1085 1086 1087 1088
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

1089 1090 1091 1092 1093
	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
1094
	if (cpuc->group_flag & PERF_EVENT_TXN)
1095 1096
		return;

P
Peter Zijlstra 已提交
1097
	x86_pmu_stop(event, PERF_EF_UPDATE);
1098

1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
1109
			break;
1110 1111
		}
	}
1112
	perf_event_update_userpage(event);
I
Ingo Molnar 已提交
1113 1114
}

1115
int x86_pmu_handle_irq(struct pt_regs *regs)
1116
{
1117
	struct perf_sample_data data;
1118 1119
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
V
Vince Weaver 已提交
1120
	int idx, handled = 0;
1121 1122
	u64 val;

1123
	perf_sample_data_init(&data, 0);
1124

1125
	cpuc = &__get_cpu_var(cpu_hw_events);
1126

1127 1128 1129 1130 1131 1132 1133 1134 1135 1136
	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

1137
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1138 1139 1140 1141 1142 1143 1144 1145
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter some cpus
			 * might still deliver spurious interrupts still
			 * in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
1146
			continue;
1147
		}
1148

1149
		event = cpuc->events[idx];
1150

1151
		val = x86_perf_event_update(event);
1152
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1153
			continue;
1154

1155
		/*
1156
		 * event overflow
1157
		 */
1158
		handled++;
1159
		data.period	= event->hw.last_period;
1160

1161
		if (!x86_perf_event_set_period(event))
1162 1163
			continue;

1164
		if (perf_event_overflow(event, &data, regs))
P
Peter Zijlstra 已提交
1165
			x86_pmu_stop(event, 0);
1166
	}
1167

1168 1169 1170
	if (handled)
		inc_irq_stat(apic_perf_irqs);

1171 1172
	return handled;
}
1173

1174
void perf_events_lapic_init(void)
I
Ingo Molnar 已提交
1175
{
1176
	if (!x86_pmu.apic || !x86_pmu_initialized())
I
Ingo Molnar 已提交
1177
		return;
1178

I
Ingo Molnar 已提交
1179
	/*
1180
	 * Always use NMI for PMU
I
Ingo Molnar 已提交
1181
	 */
1182
	apic_write(APIC_LVTPC, APIC_DM_NMI);
I
Ingo Molnar 已提交
1183 1184 1185
}

static int __kprobes
1186
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
I
Ingo Molnar 已提交
1187
{
1188
	if (!atomic_read(&active_events))
1189
		return NMI_DONE;
1190

1191
	return x86_pmu.handle_irq(regs);
I
Ingo Molnar 已提交
1192 1193
}

1194 1195
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
1196

1197 1198 1199 1200
static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
1201
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1202
	int ret = NOTIFY_OK;
1203 1204 1205

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
1206
		cpuc->kfree_on_online = NULL;
1207
		if (x86_pmu.cpu_prepare)
1208
			ret = x86_pmu.cpu_prepare(cpu);
1209 1210 1211 1212 1213 1214 1215
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

1216 1217 1218 1219
	case CPU_ONLINE:
		kfree(cpuc->kfree_on_online);
		break;

1220 1221 1222 1223 1224
	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

1225
	case CPU_UP_CANCELED:
1226 1227 1228 1229 1230 1231 1232 1233 1234
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

1235
	return ret;
1236 1237
}
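
/*
 * Hotplug flow for the notifier above: CPU_UP_PREPARE lets the driver
 * allocate per-cpu state via cpu_prepare(), CPU_STARTING runs on the new
 * CPU itself, and CPU_DYING/CPU_DEAD tear the state down again.  Memory
 * that a cpu_starting() callback cannot conveniently free in that early
 * context is parked in cpuc->kfree_on_online and released at CPU_ONLINE.
 */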

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

1248
static int __init init_hw_perf_events(void)
1249
{
1250
	struct x86_pmu_quirk *quirk;
1251
	struct event_constraint *c;
1252 1253
	int err;

1254
	pr_info("Performance Events: ");
1255

1256 1257
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
1258
		err = intel_pmu_init();
1259
		break;
1260
	case X86_VENDOR_AMD:
1261
		err = amd_pmu_init();
1262
		break;
1263
	default:
1264
		return 0;
1265
	}
1266
	if (err != 0) {
1267
		pr_cont("no PMU driver, software events only.\n");
1268
		return 0;
1269
	}
1270

1271 1272
	pmu_check_apic();

1273
	/* sanity check that the hardware exists or is emulated */
1274
	if (!check_hw_exists())
1275
		return 0;
1276

1277
	pr_cont("%s PMU driver.\n", x86_pmu.name);
1278

1279 1280
	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();
1281

1282
	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1283
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1284 1285
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
I
Ingo Molnar 已提交
1286
	}
1287
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
I
Ingo Molnar 已提交
1288

1289
	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1290
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1291 1292
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1293
	}
1294

1295
	x86_pmu.intel_ctrl |=
1296
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
I
Ingo Molnar 已提交
1297

1298
	perf_events_lapic_init();
1299
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1300

1301
	unconstrained = (struct event_constraint)
1302
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1303
				   0, x86_pmu.num_counters, 0);
1304

1305
	if (x86_pmu.event_constraints) {
1306 1307 1308 1309
		/*
		 * event on fixed counter2 (REF_CYCLES) only works on this
		 * counter, so do not extend mask to generic counters
		 */
1310
		for_each_event_constraint(c, x86_pmu.event_constraints) {
1311 1312
			if (c->cmask != X86_RAW_EVENT_MASK
			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1313
				continue;
1314
			}
1315

1316 1317
			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
1318 1319 1320
		}
	}

I
Ingo Molnar 已提交
1321
	pr_info("... version:                %d\n",     x86_pmu.version);
1322 1323 1324
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
I
Ingo Molnar 已提交
1325
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1326
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1327
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1328

P
Peter Zijlstra 已提交
1329
	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1330
	perf_cpu_notifier(x86_pmu_notifier);
1331 1332

	return 0;
I
Ingo Molnar 已提交
1333
}
1334
early_initcall(init_hw_perf_events);
I
Ingo Molnar 已提交
1335

1336
static inline void x86_pmu_read(struct perf_event *event)
1337
{
1338
	x86_perf_event_update(event);
1339 1340
}

1341 1342 1343 1344 1345
/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
P
Peter Zijlstra 已提交
1346
static void x86_pmu_start_txn(struct pmu *pmu)
1347
{
P
Peter Zijlstra 已提交
1348
	perf_pmu_disable(pmu);
T
Tejun Heo 已提交
1349 1350
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
1351 1352 1353 1354 1355 1356 1357
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
P
Peter Zijlstra 已提交
1358
static void x86_pmu_cancel_txn(struct pmu *pmu)
1359
{
T
Tejun Heo 已提交
1360
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1361 1362 1363
	/*
	 * Truncate the collected events.
	 */
T
Tejun Heo 已提交
1364 1365
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
P
Peter Zijlstra 已提交
1366
	perf_pmu_enable(pmu);
1367 1368 1369 1370 1371 1372 1373
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
P
Peter Zijlstra 已提交
1374
static int x86_pmu_commit_txn(struct pmu *pmu)
1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

1395
	cpuc->group_flag &= ~PERF_EVENT_TXN;
P
Peter Zijlstra 已提交
1396
	perf_pmu_enable(pmu);
1397 1398
	return 0;
}
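
/*
 * The core typically drives this transaction API roughly as follows:
 *
 *	pmu->start_txn(pmu);
 *	for each event in the group:
 *		pmu->add(event, flags);
 *	if (pmu->commit_txn(pmu))
 *		pmu->cancel_txn(pmu);
 *
 * so the per-event add() calls only collect events and the group-wide
 * schedulability test runs once, here in x86_pmu_commit_txn().
 */
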
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}
1433

1434 1435 1436 1437 1438 1439 1440 1441 1442
/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

1443 1444 1445
	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
1446 1447 1448 1449

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
1450
		ret = -EINVAL;
1451 1452 1453 1454

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

1455
	free_fake_cpuc(fake_cpuc);
1456 1457 1458 1459

	return ret;
}

1460 1461 1462 1463
/*
 * validate a single event group
 *
 * validation include:
1464 1465 1466
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
1467 1468 1469 1470
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
1471 1472
static int validate_group(struct perf_event *event)
{
1473
	struct perf_event *leader = event->group_leader;
1474
	struct cpu_hw_events *fake_cpuc;
1475
	int ret = -EINVAL, n;
1476

1477 1478 1479
	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
1480 1481 1482 1483 1484 1485
	/*
	 * the event is not yet connected with its
	 * siblings; therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
1486
	n = collect_events(fake_cpuc, leader, true);
1487
	if (n < 0)
1488
		goto out;
1489

1490 1491
	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
1492
	if (n < 0)
1493
		goto out;
1494

1495
	fake_cpuc->n_events = n;
1496

1497
	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1498 1499

out:
1500
	free_fake_cpuc(fake_cpuc);
1501
	return ret;
1502 1503
}
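
/*
 * A group that asks for more x86 events than there are hardware
 * counters, for instance, fails this simulation with -EINVAL and is
 * rejected at event creation time, before the real PMU is touched.
 */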

static int x86_pmu_event_init(struct perf_event *event)
I
Ingo Molnar 已提交
1505
{
P
Peter Zijlstra 已提交
1506
	struct pmu *tmp;
I
Ingo Molnar 已提交
1507 1508
	int err;

1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519
	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
1520
	if (!err) {
1521 1522 1523 1524 1525 1526 1527 1528
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

1529 1530
		if (event->group_leader != event)
			err = validate_group(event);
1531 1532
		else
			err = validate_event(event);
1533 1534

		event->pmu = tmp;
1535
	}
1536
	if (err) {
1537 1538
		if (event->destroy)
			event->destroy(event);
1539
	}
I
Ingo Molnar 已提交
1540

1541
	return err;
I
Ingo Molnar 已提交
1542
}
1543

1544
static struct pmu pmu = {
P
Peter Zijlstra 已提交
1545 1546 1547
	.pmu_enable	= x86_pmu_enable,
	.pmu_disable	= x86_pmu_disable,

1548
	.event_init	= x86_pmu_event_init,
P
Peter Zijlstra 已提交
1549 1550 1551

	.add		= x86_pmu_add,
	.del		= x86_pmu_del,
1552 1553 1554
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,
P
Peter Zijlstra 已提交
1555

1556 1557 1558 1559 1560
	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

1561 1562 1563 1564 1565 1566
/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
1567
	return 0;
1568 1569 1570 1571 1572 1573
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

1574
	perf_callchain_store(entry, addr);
1575 1576 1577 1578 1579
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
1580
	.walk_stack		= print_context_stack_bp,
1581 1582
};

1583 1584
void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1585
{
1586 1587
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
1588
		return;
1589 1590
	}

1591
	perf_callchain_store(entry, regs->ip);
1592

1593
	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1594 1595
}

1596
#ifdef CONFIG_COMPAT
H
H. Peter Anvin 已提交
1597 1598 1599

#include <asm/compat.h>

1600 1601
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1602
{
1603 1604 1605
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;
1606

1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618
	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;
1619

1620 1621
		if (fp < compat_ptr(regs->sp))
			break;
1622

1623
		perf_callchain_store(entry, frame.return_address);
1624 1625 1626
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
1627
}
1628 1629 1630 1631 1632 1633 1634
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
    return 0;
}
#endif
1635

1636 1637
void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1638 1639 1640 1641
{
	struct stack_frame frame;
	const void __user *fp;

1642 1643
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
1644
		return;
1645
	}
1646

1647
	fp = (void __user *)regs->bp;
1648

1649
	perf_callchain_store(entry, regs->ip);
1650

1651 1652 1653
	if (!current->mm)
		return;

1654 1655 1656
	if (perf_callchain_user32(regs, entry))
		return;

1657
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
1658
		unsigned long bytes;
1659
		frame.next_frame	     = NULL;
1660 1661
		frame.return_address = 0;

1662 1663
		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
1664 1665
			break;

1666
		if ((unsigned long)fp < regs->sp)
1667 1668
			break;

1669
		perf_callchain_store(entry, frame.return_address);
1670
		fp = frame.next_frame;
1671 1672 1673
	}
}

1674 1675 1676
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;
1677

1678 1679 1680 1681
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);
1682

1683 1684 1685 1686 1687 1688
	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;
1689

1690
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

1702
	if (regs->flags & PERF_EFLAGS_EXACT)
P
Peter Zijlstra 已提交
1703
		misc |= PERF_RECORD_MISC_EXACT_IP;
1704 1705 1706

	return misc;
}
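
/*
 * perf_get_x86_pmu_capability() below exports a snapshot of the PMU
 * geometry (counter counts, widths and the event mask) so other kernel
 * code, for example a hypervisor emulating a guest PMU, can size its
 * virtual counters without probing CPUID or the MSRs itself.
 */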

void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version		= x86_pmu.version;
	cap->num_counters_gp	= x86_pmu.num_counters;
	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
	cap->bit_width_gp	= x86_pmu.cntval_bits;
	cap->bit_width_fixed	= x86_pmu.cntval_bits;
	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len	= x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);