/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/compat.h>

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif
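/*
 * Note: flipping the #if 0 above to #if 1 replaces wrmsrl() with a variant
 * that logs every PMU MSR write through trace_printk() before performing it
 * with native_write_msr(), which is a debugging aid for counter programming.
 */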

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
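/*
 * Page faults cannot be taken in NMI context, so the buffer is pinned with
 * __get_user_pages_fast() and copied through a temporary atomic kmap rather
 * than going through the usual copy_from_user().
 */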
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	int type = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page, type);
		memcpy(to, map+offset, size);
		kunmap_atomic(map, type);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

#define MAX_LBR_ENTRIES		16

struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int			enabled;

	int			n_events;
	int			n_added;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	unsigned int		group_flag;

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
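/*
 * Illustrative example: INTEL_EVENT_CONSTRAINT(0xc0, 0x3) (see below) would
 * describe an event with code 0xc0 that may only be scheduled on generic
 * counters 0 and 1; its weight, HWEIGHT(0x3) == 2, is what
 * x86_schedule_events() orders the assignment by.
 */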

145 146 147
/*
 * Constraint on the Event code.
 */
148
#define INTEL_EVENT_CONSTRAINT(c, n)	\
149
	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
150

151 152
/*
 * Constraint on the Event code + UMask + fixed-mask
153 154 155 156 157 158 159 160
 *
 * filter mask to validate fixed counter events.
 * the following filters disqualify for fixed counters:
 *  - inv
 *  - edge
 *  - cnt-mask
 *  The other filters are supported by fixed counters.
 *  The any-thread option is supported starting with v3.
161
 */
162
#define FIXED_EVENT_CONSTRAINT(c, n)	\
163
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
164

165 166 167 168 169 170
/*
 * Constraint on the Event code + UMask
 */
#define PEBS_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

171 172 173 174
#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
175
	for ((e) = (c); (e)->weight; (e)++)
176

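/*
 * Bit layout follows the IA32_PERF_CAPABILITIES MSR (Intel Arch Perfmon v2+);
 * x86_pmu.intel_cap below is filled from that MSR by the Intel-specific
 * init code.
 */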
union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(int added);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	int		(*hw_config)(struct perf_event *event);
	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		cntval_bits;
	u64		cntval_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	void		(*quirks)(void);

	int		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
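	/*
	 * Example: with 48-bit counters, shift is 16; left-aligning both raw
	 * values to bit 63 before subtracting and then shifting the result
	 * back sign-extends the 48-bit quantities, so the delta is correct
	 * even when the hardware leaves the upper bits unset.
	 */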
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &event->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static int reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

static int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		atomic64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	if (attr->type == PERF_TYPE_RAW)
		return 0;

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

static int x86_pmu_hw_config(struct perf_event *event)
{
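	/*
	 * attr.precise_ip encodes the requested sampling precision: level 1
	 * can be satisfied by PEBS alone (constant skid), level 2 additionally
	 * needs LBR so the exact instruction pointer can be reconstructed.
	 */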
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs)
			precise++;

		/* Support for IP fixup */
		if (x86_pmu.lbr_nr)
			precise++;

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __hw_perf_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else {
				err = reserve_ds_buffers();
				if (err)
					release_pmc_hardware();
			}
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	return x86_pmu.hw_config(event);
}

static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu.eventsel + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

void hw_perf_disable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct perf_event *event = cpuc->events[idx];
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		val = event->hw.config;
		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static const struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
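	/*
	 * e.g. an event that is only allowed on one (fixed) counter has
	 * weight 1 and is placed first, while a fully unconstrained event
	 * has weight x86_pmu.num_counters and is placed last.
	 */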
	wmax = x86_pmu.num_counters;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_counters_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

738
	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	} else {
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static int x86_pmu_start(struct perf_event *event);
static void x86_pmu_stop(struct perf_event *event);

void hw_perf_enable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			x86_pmu_stop(event);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			x86_pmu_start(event);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
					  u64 enable_mask)
{
	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
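	/*
	 * Programming the counter to -left (truncated to cntval_mask bits by
	 * the write below) makes it overflow, and raise its interrupt, after
	 * another 'left' increments.
	 */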
	atomic64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base + idx,
			(u64)(-left) & x86_pmu.cntval_mask);

	perf_event_update_userpage(event);

	return ret;
}

static void x86_pmu_enable_event(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	if (cpuc->enabled)
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * activate a single event
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 *
 * Called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
static int x86_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	n = collect_events(cpuc, event, false);
	if (n < 0)
		return n;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
		goto out;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

out:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;

	return 0;
}

static int x86_pmu_start(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (idx == -1)
		return -EAGAIN;

	x86_perf_event_set_period(event);
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_event *event)
{
	int ret = x86_pmu_start(event);
	WARN_ON_ONCE(ret);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (!__test_and_clear_bit(idx, cpuc->active_mask))
		return;

	x86_pmu.disable(event);

	/*
	 * Drain the remaining delta count out of an event
	 * that we are disabling:
	 */
	x86_perf_event_update(event);

	cpuc->events[idx] = NULL;
}

static void x86_pmu_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	x86_pmu_stop(event);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];
		hwc = &event->hw;

		val = x86_perf_event_update(event);
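		/*
		 * The counter was programmed to a negative value (see
		 * x86_perf_event_set_period()); if the sign bit is still set
		 * it has not overflowed yet.
		 */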
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled		= 1;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_event_do_pending();
	irq_exit();
}

void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_p4.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

void __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pmu_check_apic();

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
	perf_max_events = x86_pmu.num_counters;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_cpu_notifier(x86_pmu_notifier);
}

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	return 0;
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	const struct pmu *tmp;
	int err;

	err = __hw_perf_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
		return ERR_PTR(err);
	}

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
	if (entry->nr < PERF_MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

#include "../dumpstack.h"

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	callchain_store(entry, PERF_CONTEXT_KERNEL);
	callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}

#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (fp < compat_ptr(regs->sp))
			break;

		callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
    return 0;
}
#endif

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

	if (!user_mode(regs))
		regs = task_pt_regs(current);

	fp = (void __user *)regs->bp;

	callchain_store(entry, PERF_CONTEXT_USER);
	callchain_store(entry, regs->ip);

	if (perf_callchain_user32(regs, entry))
		return;

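	/*
	 * Walk the user stack via the saved frame pointers: each iteration
	 * copies one struct stack_frame (saved bp + return address) with the
	 * NMI-safe copy_from_user_nmi() helper.
	 */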
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return NULL;
	}

	if (in_nmi())
		entry = &__get_cpu_var(pmc_nmi_entry);
	else
		entry = &__get_cpu_var(pmc_irq_entry);

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}

void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
	regs->ip = ip;
	/*
	 * perf_arch_fetch_caller_regs adds another call, we need to increment
	 * the skip level
	 */
	regs->bp = rewind_frame_pointer(skip + 1);
	regs->cs = __KERNEL_CS;
	local_save_flags(regs->flags);
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}