/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/compat.h>
#include <asm/smp.h>
#include <asm/alternative.h>

#include "perf_event.h"

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
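
/*
 * Both tables are indexed as [cache][operation][result].  In
 * hw_cache_event_ids a value of 0 means the combination is not
 * supported and -1 marks it as invalid; see set_ext_hw_attr().
 */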

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
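	/*
	 * For example, with 48-bit counters cntval_bits is 48 and shift
	 * is 16: shifting both raw values left by 16 and the resulting
	 * s64 delta right by 16 sign-extends the 48-bit counts, so the
	 * subtraction stays correct across a counter wrap.
	 */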
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
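/*
 * An "extra register" is an auxiliary MSR (for example an Intel
 * offcore-response MSR) whose value is taken from attr->config1;
 * er->valid_mask restricts which bits user space may set there.
 */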
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
	int i, reg, ret = 0;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
			goto bios_fail;
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4))
				goto bios_fail;
		}
	}

	/*
	 * Now write a value and read it back to see if it matches,
	 * this is needed to detect certain hardware emulators (qemu/kvm)
	 * that don't trap on the MSR access and always return 0s.
	 */
	val = 0xabcdUL;
	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	return true;

bios_fail:
	/*
	 * We still allow the PMU driver to operate:
	 */
	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via an hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Do not allow config1 (extended registers) to propagate,
	 * there's no sane user-space generalization yet:
	 */
	if (attr->type == PERF_TYPE_RAW)
		return 0;

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
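	/*
	 * A fixed sample period of 1 on the branch-instructions event
	 * (attr->freq clear) is the combination that selects BTS, which
	 * logs every branch rather than sampling them.
	 */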
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;
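		/*
		 * precise_ip levels, roughly: 1 requires PEBS (constant
		 * skid), 2 additionally requires LBR-based IP fixup;
		 * anything beyond what the hardware offers is rejected
		 * below.
		 */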

		/* Support for constant skid */
		if (x86_pmu.pebs_active) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
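	/*
	 * For instance, an event constrained to a single counter has
	 * weight 1 and is scheduled first, while a fully unconstrained
	 * event has weight num_counters and is scheduled last.
	 */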
	wmax = x86_pmu.num_counters;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_counters_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -EINVAL : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];
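
	/*
	 * Three cases follow: the BTS pseudo-counter has no real
	 * config/count MSRs, fixed counters share one control MSR and
	 * have their own count MSRs, and generic counters use the
	 * per-PMU config/event address helpers.
	 */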

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);
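	/*
	 * The counter is programmed with -left: prev_count records it
	 * for the delta logic in x86_perf_event_update(), and the
	 * hardware write below makes the counter overflow (raising a
	 * PMI when enabled) after 'left' more increments.
	 */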

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	perf_pmu_disable(event->pmu);
	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter some cpus
			 * might still deliver spurious interrupts still
			 * in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
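		/*
		 * The counter was programmed with -left, so its top
		 * implemented bit stays set until it actually overflows;
		 * if that bit is still set there is no overflow to
		 * handle yet.
		 */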
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int __kprobes
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	if (!atomic_read(&active_events))
		return NMI_DONE;

	return x86_pmu.handle_irq(regs);
}

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		cpuc->kfree_on_online = NULL;
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_ONLINE:
		kfree(cpuc->kfree_on_online);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

static int __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return 0;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
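/*
 * Typical sequence from the core code (roughly): start_txn(), a series
 * of pmu->add() calls for the group members, then commit_txn(), which
 * performs the schedulability test for the whole group, or cancel_txn()
 * on failure.
 */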
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate the collected events.
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation include:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	return err;
}

static struct pmu pmu = {
	.pmu_enable	= x86_pmu_enable,
	.pmu_disable	= x86_pmu_disable,

	.event_init	= x86_pmu_event_init,

	.add		= x86_pmu_add,
	.del		= x86_pmu_del,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,

	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (fp < compat_ptr(regs->sp))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!current->mm)
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}