/*
2
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	int type = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page, type);
		memcpy(to, map+offset, size);
		kunmap_atomic(map, type);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}

static u64 perf_event_mask __read_mostly;

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

#define MAX_LBR_ENTRIES		16

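/*
 * Per-CPU PMU bookkeeping: the events currently programmed, their
 * counter assignments, plus the Intel DebugStore/LBR and AMD
 * northbridge state that goes with them.
 */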
struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;

	int			n_events;
	int			n_added;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))

/*
 * Constraint on the Event code.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)

/*
 * Constraint on the Event code + UMask + fixed-mask
 */
#define FIXED_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)

/*
 * Constraint on the Event code + UMask
 */
#define PEBS_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
	for ((e) = (c); (e)->cmask; (e)++)

union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_events;
	int		num_events_fixed;
	int		event_bits;
	u64		event_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	void		(*quirks)(void);

	void		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.event_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &event->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

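/*
 * Reserve the counter and event-select MSRs through the NMI-watchdog
 * reservation interface; the lapic NMI watchdog is disabled while
 * perf owns the PMCs and re-enabled when they are released.
 */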
static bool reserve_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}
#endif

	return true;

#ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_events;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
#endif
}

static void release_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	for (i = 0; i < x86_pmu.num_events; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
#endif
}

static int reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

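/*
 * Decode a PERF_TYPE_HW_CACHE config: bits 0-7 select the cache,
 * bits 8-15 the operation and bits 16-23 the result, then look up
 * the model-specific raw event in hw_cache_event_ids[][][].
 */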
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

/*
 * Set up the hardware configuration for a given attr_type
 */
static int __hw_perf_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				err = reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	hwc->idx = -1;
	hwc->last_cpu = -1;
	hwc->last_tag = ~0ULL;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		atomic64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Raw hw_event types provide the config in the hw_event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

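/*
 * Generic ->disable_all() implementation: clear the enable bit in the
 * event-select MSR of every active counter.
 */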
static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu.eventsel + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

void hw_perf_disable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static void x86_pmu_enable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		struct perf_event *event = cpuc->events[idx];
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		val = event->hw.config;
		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static const struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

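/*
 * Assign the n collected events to hardware counters: the fastpath
 * keeps each event on the counter it used last time; the slowpath
 * assigns the most constrained (lowest weight) events first.
 */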
static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_events;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_events_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	} else {
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static int x86_pmu_start(struct perf_event *event);
static void x86_pmu_stop(struct perf_event *event);

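/*
 * Counterpart to hw_perf_disable(): reprogram any events whose counter
 * assignment changed while the PMU was disabled, then set the global
 * enable bits again.
 */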
void hw_perf_enable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {

			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			x86_pmu_stop(event);

			hwc->idx = -1;
		}

		for (i = 0; i < cpuc->n_events; i++) {

			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (i < n_running &&
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			if (hwc->idx == -1)
				x86_assign_hw_event(event, cpuc, i);

			x86_pmu_start(event);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all();
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
{
	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	(void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int err, ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->event_base + idx,
			     (u64)(-left) & x86_pmu.event_mask);

	perf_event_update_userpage(event);

	return ret;
}

static void x86_pmu_enable_event(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	if (cpuc->enabled)
		__x86_pmu_enable_event(&event->hw);
}

/*
 * activate a single event
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 *
 * Called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
static int x86_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	n = collect_events(cpuc, event, false);
	if (n < 0)
		return n;

	ret = x86_schedule_events(cpuc, n, assign);
	if (ret)
		return ret;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->n_events = n;
	cpuc->n_added += n - n0;

	return 0;
}

static int x86_pmu_start(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (idx == -1)
		return -EAGAIN;

	x86_perf_event_set_period(event);
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_event *event)
{
	int ret = x86_pmu_start(event);
	WARN_ON_ONCE(ret);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_events)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (!__test_and_clear_bit(idx, cpuc->active_mask))
		return;

	x86_pmu.disable(event);

	/*
	 * Drain the remaining delta count out of an event
	 * that we are disabling:
	 */
	x86_perf_event_update(event);

	cpuc->events[idx] = NULL;
}

static void x86_pmu_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	x86_pmu_stop(event);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

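/*
 * Generic PMI handler: fold the current count into every active event,
 * re-arm the sampling period of the counters that overflowed and pass
 * the samples to perf_event_overflow().
 */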
static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];
		hwc = &event->hw;

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.event_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled		= 1;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_event_do_pending();
	irq_exit();
}

void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}

void perf_events_lapic_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
}

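/*
 * NMI die-notifier entry point: re-write the LVTPC entry and hand the
 * register state to the vendor interrupt handler.
 */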
static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

#ifdef CONFIG_X86_LOCAL_APIC
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

static int x86_event_sched_in(struct perf_event *event,
			  struct perf_cpu_context *cpuctx)
{
	int ret = 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();
	event->tstamp_running += event->ctx->time - event->tstamp_stopped;

	if (!is_x86_event(event))
		ret = event->pmu->enable(event);

	if (!ret && !is_software_event(event))
		cpuctx->active_oncpu++;

	if (!ret && event->attr.exclusive)
		cpuctx->exclusive = 1;

	return ret;
}

static void x86_event_sched_out(struct perf_event *event,
			    struct perf_cpu_context *cpuctx)
{
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->oncpu = -1;

	if (!is_x86_event(event))
		event->pmu->disable(event);

	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;

	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

/*
 * Called to enable a whole group of events.
 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
 * Assumes the caller has disabled interrupts and has
 * frozen the PMU with hw_perf_save_disable.
 *
 * called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
int hw_perf_group_sched_in(struct perf_event *leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *sub;
	int assign[X86_PMC_IDX_MAX];
	int n0, n1, ret;

	/* n0 = total number of events */
	n0 = collect_events(cpuc, leader, true);
	if (n0 < 0)
		return n0;

	ret = x86_schedule_events(cpuc, n0, assign);
	if (ret)
		return ret;

	ret = x86_event_sched_in(leader, cpuctx);
	if (ret)
		return ret;

	n1 = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state > PERF_EVENT_STATE_OFF) {
			ret = x86_event_sched_in(sub, cpuctx);
			if (ret)
				goto undo;
			++n1;
		}
	}
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n0*sizeof(int));

	cpuc->n_events  = n0;
	cpuc->n_added  += n1;
	ctx->nr_active += n1;

	/*
	 * 1 means successful and events are active
	 * This is not quite true because we defer
	 * actual activation until hw_perf_enable() but
	 * this way we ensure the caller won't try to enable
	 * individual events
	 */
	return 1;
undo:
	x86_event_sched_out(leader, cpuctx);
	n0  = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
			x86_event_sched_out(sub, cpuctx);
			if (++n0 == n1)
				break;
		}
	}
	return ret;
}

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

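/*
 * CPU hotplug notifier: forward the lifecycle transitions to the
 * optional vendor cpu_prepare/cpu_starting/cpu_dying/cpu_dead hooks.
 */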
static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

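/*
 * Boot-time initialization: probe the vendor PMU driver, apply quirks,
 * clamp the counter counts to the architectural maximum and register
 * the NMI and CPU-hotplug notifiers.
 */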
void __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pmu_check_apic();

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
	}
	perf_event_mask = (1 << x86_pmu.num_events) - 1;
	perf_max_events = x86_pmu.num_events;

	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
	}

	perf_event_mask |=
		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
	x86_pmu.intel_ctrl = perf_event_mask;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
				   0, x86_pmu.num_events);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != INTEL_ARCH_FIXED_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
			c->weight += x86_pmu.num_events;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
	pr_info("... event mask:             %016Lx\n", perf_event_mask);

	perf_cpu_notifier(x86_pmu_notifier);
}

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation include:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

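/*
 * Entry point for the core perf code: set up the hardware config for
 * the event and validate that it (or its whole group) can be scheduled
 * on this PMU.
 */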
const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	const struct pmu *tmp;
	int err;

	err = __hw_perf_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
		return ERR_PTR(err);
	}

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
	if (entry->nr < PERF_MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
1589
	.walk_stack		= print_context_stack_bp,
1590 1591
};

1592 1593
#include "../dumpstack.h"

1594 1595 1596
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
1597
	callchain_store(entry, PERF_CONTEXT_KERNEL);
1598
	callchain_store(entry, regs->ip);
1599

1600
	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1601 1602
}

1603 1604 1605 1606 1607 1608 1609
static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	unsigned long bytes;

	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));

	return bytes == sizeof(*frame);
1610 1611 1612 1613 1614 1615 1616 1617
}

static void
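/*
 * Walk the user-space stack by following saved frame pointers; each
 * stack_frame is copied with the NMI-safe copy_from_user_nmi() above.
 */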
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

1618 1619 1620
	if (!user_mode(regs))
		regs = task_pt_regs(current);

1621
	fp = (void __user *)regs->bp;
1622

1623
	callchain_store(entry, PERF_CONTEXT_USER);
1624 1625
	callchain_store(entry, regs->ip);

1626
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
1627
		frame.next_frame	     = NULL;
1628 1629 1630 1631 1632
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

1633
		if ((unsigned long)fp < regs->sp)
1634 1635 1636
			break;

		callchain_store(entry, frame.return_address);
1637
		fp = frame.next_frame;
1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665
	}
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
1666
		entry = &__get_cpu_var(pmc_nmi_entry);
1667
	else
1668
		entry = &__get_cpu_var(pmc_irq_entry);
1669 1670 1671 1672 1673 1674 1675

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}