/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
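/*
 * Note: the user-stack callchain unwinder below (copy_stack_frame())
 * relies on this helper, since a regular copy_from_user() is not safe
 * to call from NMI context.
 */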
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	int type = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page, type);
		memcpy(to, map+offset, size);
		kunmap_atomic(map, type);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}

static u64 perf_event_mask __read_mostly;

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

#define MAX_LBR_ENTRIES		16

struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;

	int			n_events;
	int			n_added;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
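/*
 * Illustrative example (not an entry from a real constraint table): an
 * event constraint with .code = 0xc0, .cmask = INTEL_ARCH_EVTSEL_MASK and
 * .idxmsk64 = 0x3 restricts that event code to generic counters 0 and 1;
 * its weight is HWEIGHT(0x3) = 2, i.e. the number of usable counters.
 */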

/*
 * Constraint on the Event code.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)

/*
 * Constraint on the Event code + UMask + fixed-mask
 */
#define FIXED_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)

/*
 * Constraint on the Event code + UMask
 */
#define PEBS_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
	for ((e) = (c); (e)->cmask; (e)++)

union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};
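/*
 * Note: this layout is assumed to mirror the bit-fields reported by the
 * IA32_PERF_CAPABILITIES MSR on Intel CPUs; only the fields used by this
 * driver are named here.
 */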

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_events;
	int		num_events_fixed;
	int		event_bits;
	u64		event_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	void		(*quirks)(void);

	void		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.event_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
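	/*
	 * Example: with 48-bit counters, shift is 64 - 48 = 16.  Shifting
	 * both raw values left by 16 places the counter's sign bit in
	 * bit 63, so the subtraction and the arithmetic shift back down
	 * produce a correctly signed delta even though the hardware does
	 * not sign-extend the 48-bit value.
	 */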
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &event->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}
#endif

	return true;

#ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_events;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
#endif
}

static void release_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	for (i = 0; i < x86_pmu.num_events; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
#endif
}

static int reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __hw_perf_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				err = reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	hwc->idx = -1;
	hwc->last_cpu = -1;
	hwc->last_tag = ~0ULL;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		atomic64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Raw hw_event type provides the config in the hw_event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu.eventsel + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

void hw_perf_disable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static void x86_pmu_enable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		struct perf_event *event = cpuc->events[idx];
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		val = event->hw.config;
		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static const struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_events;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_events_fixed)
		wmax++;

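	/*
	 * Illustrative example: with one event constrained to a single
	 * counter (weight 1) and one unconstrained event, the w == 1 pass
	 * places the constrained event first, so the unconstrained event
	 * cannot steal the only counter the other one can use.
	 */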
	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	} else {
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static int x86_pmu_start(struct perf_event *event);
static void x86_pmu_stop(struct perf_event *event);

void hw_perf_enable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {

			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			x86_pmu_stop(event);

			hwc->idx = -1;
		}

		for (i = 0; i < cpuc->n_events; i++) {

			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (i < n_running &&
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			if (hwc->idx == -1)
				x86_assign_hw_event(event, cpuc, i);

			x86_pmu_start(event);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all();
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
{
	wrmsrl(hwc->config_base + hwc->idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
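	/*
	 * Example: for left == 100000 the counter is programmed to
	 * (u64)-100000, so it overflows (and raises the PMI) after
	 * exactly 100000 increments.
	 */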
894
	atomic64_set(&hwc->prev_count, (u64)-left);
895

896 897
	wrmsrl(hwc->event_base + idx,
			(u64)(-left) & x86_pmu.event_mask);
898

899
	perf_event_update_userpage(event);
900

901
	return ret;
902 903
}

904
static void x86_pmu_enable_event(struct perf_event *event)
905
{
906
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
907
	if (cpuc->enabled)
908
		__x86_pmu_enable_event(&event->hw);
I
Ingo Molnar 已提交
909 910
}

911
/*
 * activate a single event
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 *
 * Called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
static int x86_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	n = collect_events(cpuc, event, false);
	if (n < 0)
		return n;

	ret = x86_schedule_events(cpuc, n, assign);
	if (ret)
		return ret;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->n_events = n;
	cpuc->n_added += n - n0;

	return 0;
}

static int x86_pmu_start(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (idx == -1)
		return -EAGAIN;

	x86_perf_event_set_period(event);
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_event *event)
{
	int ret = x86_pmu_start(event);
	WARN_ON_ONCE(ret);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_events)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (!__test_and_clear_bit(idx, cpuc->active_mask))
		return;

	x86_pmu.disable(event);

	/*
	 * Drain the remaining delta count out of an event
	 * that we are disabling:
	 */
	x86_perf_event_update(event);

	cpuc->events[idx] = NULL;
}

static void x86_pmu_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	x86_pmu_stop(event);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];
		hwc = &event->hw;

		val = x86_perf_event_update(event);
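		/*
		 * The counter counts up from the negative value programmed
		 * by x86_perf_event_set_period(); if its sign bit (bit
		 * event_bits - 1) is still set, it has not overflowed and
		 * this event did not trigger the interrupt.
		 */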
		if (val & (1ULL << (x86_pmu.event_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled		= 1;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_event_do_pending();
	irq_exit();
}

void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}

void perf_events_lapic_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
}

static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

#ifdef CONFIG_X86_LOCAL_APIC
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

static int x86_event_sched_in(struct perf_event *event,
			  struct perf_cpu_context *cpuctx)
{
	int ret = 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();
	event->tstamp_running += event->ctx->time - event->tstamp_stopped;

	if (!is_x86_event(event))
		ret = event->pmu->enable(event);

	if (!ret && !is_software_event(event))
		cpuctx->active_oncpu++;

	if (!ret && event->attr.exclusive)
		cpuctx->exclusive = 1;

	return ret;
}

static void x86_event_sched_out(struct perf_event *event,
			    struct perf_cpu_context *cpuctx)
{
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->oncpu = -1;

	if (!is_x86_event(event))
		event->pmu->disable(event);

	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;

	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

/*
 * Called to enable a whole group of events.
 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
 * Assumes the caller has disabled interrupts and has
 * frozen the PMU with hw_perf_save_disable.
 *
 * called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
int hw_perf_group_sched_in(struct perf_event *leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *sub;
	int assign[X86_PMC_IDX_MAX];
	int n0, n1, ret;

	/* n0 = total number of events */
	n0 = collect_events(cpuc, leader, true);
	if (n0 < 0)
		return n0;

	ret = x86_schedule_events(cpuc, n0, assign);
	if (ret)
		return ret;

	ret = x86_event_sched_in(leader, cpuctx);
	if (ret)
		return ret;

	n1 = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state > PERF_EVENT_STATE_OFF) {
			ret = x86_event_sched_in(sub, cpuctx);
			if (ret)
				goto undo;
			++n1;
		}
	}
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n0*sizeof(int));

	cpuc->n_events  = n0;
	cpuc->n_added  += n1;
	ctx->nr_active += n1;

	/*
	 * 1 means successful and events are active
	 * This is not quite true because we defer
	 * actual activation until hw_perf_enable() but
	 * this way we ensure caller won't try to enable
	 * individual events
	 */
	return 1;
undo:
	x86_event_sched_out(leader, cpuctx);
	n0  = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
			x86_event_sched_out(sub, cpuctx);
			if (++n0 == n1)
				break;
		}
	}
	return ret;
}

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

void __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pmu_check_apic();

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
	}
	perf_event_mask = (1 << x86_pmu.num_events) - 1;
	perf_max_events = x86_pmu.num_events;

	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
	}

	perf_event_mask |=
		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
	x86_pmu.intel_ctrl = perf_event_mask;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
				   0, x86_pmu.num_events);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != INTEL_ARCH_FIXED_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
			c->weight += x86_pmu.num_events;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
	pr_info("... event mask:             %016Lx\n", perf_event_mask);

	perf_cpu_notifier(x86_pmu_notifier);
}

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	const struct pmu *tmp;
	int err;

	err = __hw_perf_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
		return ERR_PTR(err);
	}

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
	if (entry->nr < PERF_MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

#include "../dumpstack.h"

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	callchain_store(entry, PERF_CONTEXT_KERNEL);
	callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}

static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	unsigned long bytes;

	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));

	return bytes == sizeof(*frame);
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

	if (!user_mode(regs))
		regs = task_pt_regs(current);

	fp = (void __user *)regs->bp;

	callchain_store(entry, PERF_CONTEXT_USER);
	callchain_store(entry, regs->ip);

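	/*
	 * Walk the user stack by following saved frame pointers; each
	 * struct stack_frame is assumed to hold the caller's frame pointer
	 * and return address pair pushed by the function prologue.
	 */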
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(pmc_nmi_entry);
	else
		entry = &__get_cpu_var(pmc_irq_entry);

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}