/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/timer.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

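/*
 * Reserve the counter and event-select MSRs via the shared perfctr
 * allocator so the PMU does not fight with other NMI users (e.g. the
 * NMI watchdog); roll back anything already claimed on failure.
 */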
static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

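/*
 * Sanity-check the PMU: complain if the BIOS left counters enabled,
 * then write/read back a counter to catch emulators (qemu/kvm) that
 * silently ignore the MSRs. Returns false if the PMU is unusable.
 */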
static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
	int i, reg, ret = 0;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
			goto bios_fail;
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4))
				goto bios_fail;
		}
	}

	/*
	 * Now write a value and read it back to see if it matches,
	 * this is needed to detect certain hardware emulators (qemu/kvm)
	 * that don't trap on the MSR access and always return 0s.
	 */
	val = 0xabcdUL;
	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	return true;

bios_fail:
	/*
	 * We still allow the PMU driver to operate:
	 */
	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

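/*
 * Translate attr->config (raw, cache or generic hardware event) into a
 * hardware event-select value and set up the sampling period.
 */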
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

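/*
 * Validate precise_ip (PEBS) and branch-stack settings, then build the
 * base event-select bits (INT, USR, OS) before x86_setup_perfctr().
 */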
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
		/*
		 * check that PEBS LBR correction does not conflict with
		 * whatever the user is asking with attr->branch_sample_type
		 */
		if (event->attr.precise_ip > 1) {
			u64 *br_type = &event->attr.branch_sample_type;

			if (has_branch_stack(event)) {
				if (!precise_br_compat(event))
					return -EOPNOTSUPP;

				/* branch_sample_type is compatible */

			} else {
				/*
				 * user did not specify  branch_sample_type
				 *
				 * For PEBS fixups, we capture all
				 * the branches at the priv level of the
				 * event.
				 */
				*br_type = PERF_SAMPLE_BRANCH_ANY;

				if (!event->attr.exclude_user)
					*br_type |= PERF_SAMPLE_BRANCH_USER;

				if (!event->attr.exclude_kernel)
					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
			}
		}
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	struct event_constraint	**constraints;
	struct sched_state	state;
	int			saved_states;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
			    int num, int wmin, int wmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->constraints	= c;

	for (idx = 0; idx < num; idx++) {
		if (c[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];

	/* Prefer fixed purpose counters */
	if (x86_pmu.num_counters_fixed) {
		idx = X86_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}
	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used))
			goto done;
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
static int perf_assign_events(struct event_constraint **constraints, int n,
			      int wmin, int wmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}

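/*
 * Assign counters to the n collected events: try the fastpath that
 * reuses previous assignments, and fall back to the constraint-based
 * scheduler above when that fails. Returns 0 on success.
 */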
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, wmin, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n)
		num = perf_assign_events(constraints, n, wmin, wmax, assign);

	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -EINVAL : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30;
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx);
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

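/*
 * Enable the PMU on this CPU: (re)program events that were added or
 * moved to new counters since the last disable, then turn counting
 * back on.
 */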
static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	perf_pmu_disable(event->pmu);
	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

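/*
 * Generic PMI handler: for each overflowed counter, update the count,
 * re-arm the sampling period and emit a sample.
 */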
int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = &__get_cpu_var(cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter some cpus
			 * might still deliver spurious interrupts still
			 * in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		perf_sample_data_init(&data, 0, event->hw.last_period);

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int __kprobes
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	if (!atomic_read(&active_events))
		return NMI_DONE;

	return x86_pmu.handle_irq(regs);
}

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

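/*
 * CPU hotplug notifier: lets the PMU driver set up and tear down its
 * per-cpu state as CPUs come and go.
 */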
static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		cpuc->kfree_on_online = NULL;
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.attr_rdpmc)
			set_in_cr4(X86_CR4_PCE);
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_ONLINE:
		kfree(cpuc->kfree_on_online);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

static struct attribute_group x86_pmu_format_group = {
	.name = "format",
	.attrs = NULL,
};

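/*
 * Boot-time initialization: probe the vendor PMU driver, apply quirks,
 * clip the counter counts to the supported maxima and register the
 * "cpu" pmu with the core perf code.
 */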
static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return 0;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters, 0);

	if (x86_pmu.event_constraints) {
		/*
		 * event on fixed counter2 (REF_CYCLES) only works on this
		 * counter, so do not extend mask to generic counters
		 */
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK
			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
				continue;
			}

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
	x86_pmu_format_group.attrs = x86_pmu.format_attrs;

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate the collected events.
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	cpuc->is_fake = 1;
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

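/*
 * pmu::event_init callback: set up the hardware config and validate
 * that the event (or its whole group) can be scheduled on this PMU.
 */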
static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	return err;
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	int idx = event->hw.idx;

	if (!x86_pmu.attr_rdpmc)
		return 0;

	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
		idx -= X86_PMC_IDX_FIXED;
		idx |= 1 << 30;
	}

	return idx + 1;
}

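/*
 * sysfs attribute on the pmu device (typically
 * /sys/bus/event_source/devices/cpu/rdpmc) that toggles CR4.PCE and
 * with it user space access to the RDPMC instruction.
 */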
static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static void change_rdpmc(void *info)
{
	bool enable = !!(unsigned long)info;

	if (enable)
		set_in_cr4(X86_CR4_PCE);
	else
		clear_in_cr4(X86_CR4_PCE);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (!!val != !!x86_pmu.attr_rdpmc) {
		x86_pmu.attr_rdpmc = !!val;
		smp_call_function(change_rdpmc, (void *)val, 1);
	}

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group = {
	.attrs = x86_pmu_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	NULL,
};

static void x86_pmu_flush_branch_stack(void)
{
	if (x86_pmu.flush_branch_stack)
		x86_pmu.flush_branch_stack();
}

static struct pmu pmu = {
	.pmu_enable		= x86_pmu_enable,
	.pmu_disable		= x86_pmu_disable,

	.attr_groups	= x86_pmu_attr_groups,

	.event_init	= x86_pmu_event_init,

	.add			= x86_pmu_add,
	.del			= x86_pmu_del,
	.start			= x86_pmu_start,
	.stop			= x86_pmu_stop,
	.read			= x86_pmu_read,

	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,

	.event_idx	= x86_pmu_event_idx,
	.flush_branch_stack	= x86_pmu_flush_branch_stack,
};

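/*
 * Fill in the self-monitoring fields of the mmap()ed user page: whether
 * RDPMC is usable, the counter width, and TSC-based time conversion
 * parameters when the TSC is constant and non-stop.
 */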
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
	userpg->cap_usr_time = 0;
	userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return;

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		return;

	userpg->cap_usr_time = 1;
	userpg->time_mult = this_cpu_read(cyc2ns);
	userpg->time_shift = CYC2NS_SCALE_FACTOR;
	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
}

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

#ifdef CONFIG_COMPAT

#include <asm/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
    return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!current->mm)
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}

void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version		= x86_pmu.version;
	cap->num_counters_gp	= x86_pmu.num_counters;
	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
	cap->bit_width_gp	= x86_pmu.cntval_bits;
	cap->bit_width_fixed	= x86_pmu.cntval_bits;
	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len	= x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);