/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/compat.h>
#include <asm/smp.h>
#include <asm/alternative.h>

#include "perf_event.h"

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

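/*
 * Sanity-check the PMU: warn if the BIOS left counters enabled, and make
 * sure a counter MSR can actually be written and read back (some
 * hypervisors silently ignore the access and always return 0).
 */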
static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
	int i, reg, ret = 0;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
			goto bios_fail;
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4))
				goto bios_fail;
		}
	}

	/*
	 * Now write a value and read it back to see if it matches,
	 * this is needed to detect certain hardware emulators (qemu/kvm)
	 * that don't trap on the MSR access and always return 0s.
	 */
	val = 0xabcdUL;
	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	return true;

bios_fail:
	/*
	 * We still allow the PMU driver to operate:
	 */
	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

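/*
 * Set up the sample period and map the generic attribute (hardware,
 * cache or raw type) onto a hardware event selector value.
 */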
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Do not allow config1 (extended registers) to propagate,
	 * there's no sane user-space generalization yet:
	 */
	if (attr->type == PERF_TYPE_RAW)
		return 0;

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

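/*
 * Build the base event selector bits (interrupt enable, user/OS filters,
 * raw config) and validate the requested precise_ip level against the
 * PEBS/LBR capabilities of this PMU.
 */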
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}

I
Ingo Molnar 已提交
396
/*
397
 * Setup the hardware configuration for a given attr_type
I
Ingo Molnar 已提交
398
 */
399
static int __x86_pmu_event_init(struct perf_event *event)
I
Ingo Molnar 已提交
400
{
P
Peter Zijlstra 已提交
401
	int err;
I
Ingo Molnar 已提交
402

403 404
	if (!x86_pmu_initialized())
		return -ENODEV;
I
Ingo Molnar 已提交
405

P
Peter Zijlstra 已提交
406
	err = 0;
407
	if (!atomic_inc_not_zero(&active_events)) {
P
Peter Zijlstra 已提交
408
		mutex_lock(&pmc_reserve_mutex);
409
		if (atomic_read(&active_events) == 0) {
410 411
			if (!reserve_pmc_hardware())
				err = -EBUSY;
412 413
			else
				reserve_ds_buffers();
414 415
		}
		if (!err)
416
			atomic_inc(&active_events);
P
Peter Zijlstra 已提交
417 418 419 420 421
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

422
	event->destroy = hw_perf_event_destroy;
423

424 425 426
	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;
427

428 429 430
	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;

431
	return x86_pmu.hw_config(event);
432 433
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

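/*
 * Assign the collected events to hardware counters, respecting each
 * event's constraint mask.  A fast path re-uses the previous assignment
 * when it is still valid; otherwise events are (re)assigned starting
 * with the most constrained (lowest weight) ones.
 */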
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_counters;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_counters_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

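/*
 * Program the MSR base addresses for the counter that was assigned to
 * this event: fixed counters and BTS use dedicated MSRs, generic
 * counters use the per-index config/event address helpers.
 */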
static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

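/*
 * Re-enable the PMU after a scheduling pass: stop events that moved to a
 * different counter, reprogram them at their new location, then enable
 * all counters again.
 */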
static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	perf_pmu_disable(event->pmu);
	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

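/*
 * Stop the counter and, if PERF_EF_UPDATE is set, fold the remaining
 * hardware delta into the generic event count.
 */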
void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

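/*
 * Generic performance-counter interrupt handler: walk all generic
 * counters, fold in the new counts, and re-arm the sampling period for
 * every counter that overflowed.
 */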
int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter some cpus
			 * might still deliver spurious interrupts still
			 * in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

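/*
 * Book-keeping used to tell a PMU-generated back-to-back NMI apart from
 * a genuinely unknown NMI, so that the second one can be silently
 * dropped instead of dazing the CPU.
 */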
struct pmu_nmi_state {
	unsigned int	marked;
	int		handled;
};

static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);

I
Ingo Molnar 已提交
1068
static int __kprobes
1069
perf_event_nmi_handler(struct notifier_block *self,
I
Ingo Molnar 已提交
1070 1071 1072
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
1073 1074
	unsigned int this_nmi;
	int handled;
1075

1076
	if (!atomic_read(&active_events))
1077 1078
		return NOTIFY_DONE;

1079 1080 1081
	switch (cmd) {
	case DIE_NMI:
		break;
1082 1083
	case DIE_NMIUNKNOWN:
		this_nmi = percpu_read(irq_stat.__nmi_count);
T
Tejun Heo 已提交
1084
		if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
			/* let the kernel handle the unknown nmi */
			return NOTIFY_DONE;
		/*
		 * This one is a PMU back-to-back nmi. Two events
		 * trigger 'simultaneously' raising two back-to-back
		 * NMIs. If the first NMI handles both, the latter
		 * will be empty and daze the CPU. So, we drop it to
		 * avoid false-positive 'unknown nmi' messages.
		 */
		return NOTIFY_STOP;
1095
	default:
I
Ingo Molnar 已提交
1096
		return NOTIFY_DONE;
1097
	}
I
Ingo Molnar 已提交
1098

1099 1100 1101 1102 1103 1104 1105
	handled = x86_pmu.handle_irq(args->regs);
	if (!handled)
		return NOTIFY_DONE;

	this_nmi = percpu_read(irq_stat.__nmi_count);
	if ((handled > 1) ||
		/* the next nmi could be a back-to-back nmi */
T
Tejun Heo 已提交
1106 1107
	    ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
	     (__this_cpu_read(pmu_nmi.handled) > 1))) {
1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
		/*
		 * We could have two subsequent back-to-back nmis: The
		 * first handles more than one counter, the 2nd
		 * handles only one counter and the 3rd handles no
		 * counter.
		 *
		 * This is the 2nd nmi because the previous was
		 * handling more than one counter. We will mark the
		 * next (3rd) and then drop it if unhandled.
		 */
T
Tejun Heo 已提交
1118 1119
		__this_cpu_write(pmu_nmi.marked, this_nmi + 1);
		__this_cpu_write(pmu_nmi.handled, handled);
1120
	}
I
Ingo Molnar 已提交
1121

1122
	return NOTIFY_STOP;
I
Ingo Molnar 已提交
1123 1124
}

1125 1126 1127
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
1128
	.priority		= NMI_LOCAL_LOW_PRIOR,
1129 1130
};

1131 1132
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
1133

1134 1135 1136 1137
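/*
 * CPU hotplug notifier: forward the relevant hotplug phases to the
 * vendor-specific x86_pmu callbacks (prepare/starting/dying/dead) and
 * free any per-CPU memory queued for release once the CPU is online.
 */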
static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		cpuc->kfree_on_online = NULL;
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_ONLINE:
		kfree(cpuc->kfree_on_online);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

static int __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return 0;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate the collected events.
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -ENOSPC, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	return err;
}

static struct pmu pmu = {
	.pmu_enable	= x86_pmu_enable,
	.pmu_disable	= x86_pmu_disable,

	.event_init	= x86_pmu_event_init,

	.add		= x86_pmu_add,
	.del		= x86_pmu_del,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,

	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

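/*
 * Unwind the user stack of a 32-bit task running on a 64-bit kernel by
 * walking struct stack_frame_ia32 frame pointers fetched with
 * copy_from_user_nmi().
 */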
#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (fp < compat_ptr(regs->sp))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
    return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!current->mm)
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}