/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
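
/*
 * Illustrative example: with 48-bit counters (cntval_bits == 48) the
 * shift above is 16.  Shifting both raw values left by 16 and the
 * difference back right by 16 sign-extends the 48-bit quantities, so
 * the computed delta stays correct across a counter wrap even on
 * hardware that does not sign-extend above the physical width.
 */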

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(0);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
		printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
	}

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
	printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
		boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
		reg, val_new);

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}
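
/*
 * Illustrative example: attr->config for a cache event packs
 * cache_type into bits 0-7, cache_op into bits 8-15 and cache_result
 * into bits 16-23.  A config of 0x10000 therefore selects
 * { L1D, OP_READ, RESULT_MISS }, assuming the generic enum values
 * 0, 0 and 1 from the perf ABI.
 */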

/*
 * Check if we can create event of a certain type (that no conflicting events
 * are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int ret = -EBUSY, i;

	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
		return 0;

	mutex_lock(&pmc_reserve_mutex);
	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
			goto out;

	atomic_inc(&x86_pmu.lbr_exclusive[what]);
	ret = 0;

out:
	mutex_unlock(&pmc_reserve_mutex);
	return ret;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}
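
/*
 * Usage note: within this file the BTS path pairs these helpers -
 * x86_setup_perfctr() takes x86_add_exclusive(x86_lbr_exclusive_lbr)
 * before accepting a BTS event, and hw_perf_lbr_event_destroy() drops
 * the reference again via x86_del_exclusive().
 */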

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}
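
/*
 * Illustrative note: the BTS branch above is taken for a branch-
 * instructions event with a fixed sample_period of 1 that excludes
 * the kernel - roughly what "perf record -e branches:u -c 1" asks for.
 */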

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify  branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}
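
/*
 * Illustrative mapping of the precise_ip levels computed above:
 * precise_ip == 1 (":p") only needs working PEBS (constant skid),
 * precise_ip == 2 (":pp") additionally needs the LBR-based IP fixup
 * or a PEBS format >= 2 that reports the real IP directly.
 */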

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	struct perf_event	**events;
	struct sched_state	state;
	int			saved_states;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
			    int num, int wmin, int wmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->events		= events;

	for (idx = 0; idx < num; idx++) {
		if (events[idx]->hw.constraint->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->events[sched->state.event]->hw.constraint;
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}
	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used))
			goto done;
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->events[sched->state.event]->hw.constraint;
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct perf_event **events, int n,
			int wmin, int wmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, events, n, wmin, wmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);
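
/*
 * Worked example (illustrative): three events with constraint weights
 * {1, 2, 2}.  perf_sched_init() starts at wmin == 1, so the most
 * constrained event is placed first; the weight-2 events follow on the
 * next pass of the weight loop in perf_sched_next_event().  States are
 * only saved for ->overlap constraints, which bounds the O(n!)
 * backtracking to SCHED_STATES_MAX levels.
 */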

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		hwc->constraint = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = hwc->constraint;

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n)
		num = perf_assign_events(cpuc->event_list, n, wmin,
					 wmax, assign);

	/*
	 * Mark the event as committed, so we do not put_constraint()
	 * in case new events are added and fail scheduling.
	 */
	if (!num && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
		}
	}
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return num ? -EINVAL : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
	}
}
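
/*
 * Example of the rdpmc encoding set up above: fixed counter 0 gets
 * event_base_rdpmc == (0 | 1<<30), i.e. RDPMC with bit 30 set selects
 * the fixed-function counter bank, while general-purpose counter N
 * simply uses x86_pmu_rdpmc_index(N).
 */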

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	if (x86_pmu.limit_period)
		left = x86_pmu.limit_period(event, left);

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}
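
/*
 * Illustrative example: for a sample_period of 100000 the counter is
 * programmed to (-100000 & cntval_mask), so it overflows and raises
 * the PMI after 100000 increments; period_left tracks how much of the
 * period is still outstanding after partial runs.
 */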

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	/*
	 * Commit the collect_events() state. See x86_pmu_del() and
	 * x86_pmu_*_txn().
	 */
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int i;

	/*
	 * event is descheduled
	 */
	event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 *
	 * XXX assumes any ->del() called during a TXN will only be on
	 * an event added during that same TXN.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	/*
	 * Not a TXN, therefore cleanup properly.
	 */
	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i])
			break;
	}

	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
		return;

	/* If we have a newly added event; make sure to decrease n_added. */
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events)
		cpuc->event_list[i-1] = cpuc->event_list[i];
	--cpuc->n_events;

	perf_event_update_userpage(event);
}

int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = this_cpu_ptr(&cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter some cpus
			 * might still deliver spurious interrupts still
			 * in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		perf_sample_data_init(&data, 0, event->hw.last_period);

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}
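
/*
 * Note on the overflow test above: the counter was programmed to a
 * negative value (see x86_perf_event_set_period()), so while it has
 * not yet overflowed its top implemented bit (cntval_bits - 1) still
 * reads as set; once that bit is clear the counter wrapped and the
 * event overflowed.
 */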

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 start_clock;
	u64 finish_clock;
	int ret;

	if (!atomic_read(&active_events))
		return NMI_DONE;

	start_clock = sched_clock();
	ret = x86_pmu.handle_irq(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);

	return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i, ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
			cpuc->kfree_on_online[i] = NULL;
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_ONLINE:
		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
			kfree(cpuc->kfree_on_online[i]);
			cpuc->kfree_on_online[i] = NULL;
		}
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");

	/*
	 * If we have a PMU initialized but no APIC
	 * interrupts, we cannot sample hardware
	 * events (user-space has to fall back and
	 * sample via a hrtimer based software event):
	 */
	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;

}

static struct attribute_group x86_pmu_format_group = {
	.name = "format",
	.attrs = NULL,
};

/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * out of events_attr attributes.
 */
static void __init filter_events(struct attribute **attrs)
{
	struct device_attribute *d;
	struct perf_pmu_events_attr *pmu_attr;
	int i, j;

	for (i = 0; attrs[i]; i++) {
		d = (struct device_attribute *)attrs[i];
		pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
		/* str trumps id */
		if (pmu_attr->event_str)
			continue;
		if (x86_pmu.event_map(i))
			continue;

		for (j = i; attrs[j]; j++)
			attrs[j] = attrs[j + 1];

		/* Check the shifted attr. */
		i--;
	}
}

/* Merge two pointer arrays */
static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
{
	struct attribute **new;
	int j, i;

	for (j = 0; a[j]; j++)
		;
	for (i = 0; b[i]; i++)
		j++;
	j++;

	new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
	if (!new)
		return NULL;

	j = 0;
	for (i = 0; a[i]; i++)
		new[j++] = a[i];
	for (i = 0; b[i]; i++)
		new[j++] = b[i];
	new[j] = NULL;

	return new;
}

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
			  char *page)
{
	struct perf_pmu_events_attr *pmu_attr = \
		container_of(attr, struct perf_pmu_events_attr, attr);
	u64 config = x86_pmu.event_map(pmu_attr->id);

	/* string trumps id */
	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return x86_pmu.events_sysfs_show(page, config);
}

EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
EVENT_ATTR(instructions,		INSTRUCTIONS		);
EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
EVENT_ATTR(cache-misses, 		CACHE_MISSES		);
EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);

static struct attribute *empty_attrs;

static struct attribute *events_attr[] = {
	EVENT_PTR(CPU_CYCLES),
	EVENT_PTR(INSTRUCTIONS),
	EVENT_PTR(CACHE_REFERENCES),
	EVENT_PTR(CACHE_MISSES),
	EVENT_PTR(BRANCH_INSTRUCTIONS),
	EVENT_PTR(BRANCH_MISSES),
	EVENT_PTR(BUS_CYCLES),
	EVENT_PTR(STALLED_CYCLES_FRONTEND),
	EVENT_PTR(STALLED_CYCLES_BACKEND),
	EVENT_PTR(REF_CPU_CYCLES),
	NULL,
};

static struct attribute_group x86_pmu_events_group = {
	.name = "events",
	.attrs = events_attr,
};

ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
	u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
	bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
	bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
	bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
	bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
	ssize_t ret;

	/*
	* We have whole page size to spend and just little data
	* to write, so we can safely use sprintf.
	*/
	ret = sprintf(page, "event=0x%02llx", event);

	if (umask)
		ret += sprintf(page + ret, ",umask=0x%02llx", umask);

	if (edge)
		ret += sprintf(page + ret, ",edge");

	if (pc)
		ret += sprintf(page + ret, ",pc");

	if (any)
		ret += sprintf(page + ret, ",any");

	if (inv)
		ret += sprintf(page + ret, ",inv");

	if (cmask)
		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);

	ret += sprintf(page + ret, "\n");

	return ret;
}
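
/*
 * Example output (illustrative): for event code 0x3c with umask 0x01
 * the function above emits "event=0x3c,umask=0x01\n"; the edge, pc,
 * any, inv and cmask fields are appended only when non-zero.
 */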

static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		err = -ENOTSUPP;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */

	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();

	if (!x86_pmu.intel_ctrl)
		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters, 0, 0);

	x86_pmu_format_group.attrs = x86_pmu.format_attrs;

	if (x86_pmu.event_attrs)
		x86_pmu_events_group.attrs = x86_pmu.event_attrs;

	if (!x86_pmu.events_sysfs_show)
		x86_pmu_events_group.attrs = &empty_attrs;
	else
		filter_events(x86_pmu_events_group.attrs);

	if (x86_pmu.cpu_events) {
		struct attribute **tmp;

		tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
		if (!WARN_ON(!tmp))
			x86_pmu_events_group.attrs = tmp;
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate collected array by the number of events added in this
	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	cpuc->is_fake = 1;
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

	return err;
}

static void refresh_pce(void *ignored)
{
	if (current->mm)
		load_mm_cr4(current->mm);
}

static void x86_pmu_event_mapped(struct perf_event *event)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}

static void x86_pmu_event_unmapped(struct perf_event *event)
{
	if (!current->mm)
		return;

	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	int idx = event->hw.idx;

	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return 0;

	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
		idx -= INTEL_PMC_IDX_FIXED;
		idx |= 1 << 30;
	}

	return idx + 1;
}
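
/*
 * Userspace usage sketch (illustrative): the value returned above is
 * published as perf_event_mmap_page::index.  A monitoring thread reads
 * it under the mmap'ed page's sequence lock and, when non-zero, issues
 * RDPMC with (index - 1) to read the counter directly, bit 30 of the
 * operand selecting the fixed-counter bank.
 */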

static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
		/*
		 * Changing into or out of always available, aka
		 * perf-event-bypassing mode.  This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 2)
			static_key_slow_inc(&rdpmc_always_available);
		else
			static_key_slow_dec(&rdpmc_always_available);
		on_each_cpu(refresh_pce, NULL, 1);
	}

	x86_pmu.attr_rdpmc = val;

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group = {
	.attrs = x86_pmu_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	NULL,
};

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	if (x86_pmu.sched_task)
		x86_pmu.sched_task(ctx, sched_in);
}

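/*
 * Invoked after (late) microcode updates so the active PMU driver can
 * re-evaluate microcode-dependent workarounds (e.g. PEBS errata checks
 * on some Intel parts).
 */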
void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}
EXPORT_SYMBOL_GPL(perf_check_microcode);

static struct pmu pmu = {
	.pmu_enable		= x86_pmu_enable,
	.pmu_disable		= x86_pmu_disable,

	.attr_groups		= x86_pmu_attr_groups,

	.event_init		= x86_pmu_event_init,

	.event_mapped		= x86_pmu_event_mapped,
	.event_unmapped		= x86_pmu_event_unmapped,

	.add			= x86_pmu_add,
	.del			= x86_pmu_del,
	.start			= x86_pmu_start,
	.stop			= x86_pmu_stop,
	.read			= x86_pmu_read,

	.start_txn		= x86_pmu_start_txn,
	.cancel_txn		= x86_pmu_cancel_txn,
	.commit_txn		= x86_pmu_commit_txn,

	.event_idx		= x86_pmu_event_idx,
	.sched_task		= x86_pmu_sched_task,
	.task_ctx_size		= sizeof(struct x86_perf_task_context),
};

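/*
 * Fill in the self-monitoring fields of the event's mmap control page:
 * the RDPMC capability bit, the counter width and, when the TSC is
 * stable, the TSC-to-nanosecond conversion userspace needs.  Roughly
 * (overflow handling omitted, see the perf_event_mmap_page docs):
 *
 *	time = time_offset + ((tsc * time_mult) >> time_shift);
 */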
void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data *data;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!sched_clock_stable())
		return;

	data = cyc2ns_read_begin();

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data->cyc2ns_mul;
	userpg->time_shift = data->cyc2ns_shift;
	userpg->time_offset = data->cyc2ns_offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (event->clock == &local_clock) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = data->cyc2ns_offset;
	}

	cyc2ns_read_end(data);
}

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest OS callchains yet */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

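/*
 * Look up the base address of a GDT or LDT segment descriptor so that
 * segment-relative IP/SP values from 16/32-bit tasks can be converted
 * into linear addresses.  Returns 0 for out-of-range selectors.
 */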
static unsigned long get_segment_base(unsigned int segment)
{
	struct desc_struct *desc;
	int idx = segment >> 3;

	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		if (idx > LDT_ENTRIES)
			return 0;

		if (idx > current->active_mm->context.size)
			return 0;

		desc = current->active_mm->context.ldt;
	} else {
		if (idx > GDT_ENTRIES)
			return 0;

		desc = raw_cpu_ptr(gdt_page.gdt);
	}

	return get_desc_base(desc + idx);
}

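/*
 * Unwind the user stack of a 32-bit (compat) task: frame pointers and
 * return addresses are 32 bits wide and relative to the SS/CS segment
 * bases.  Returns non-zero once it has handled the unwind so the 64-bit
 * walker below can bail out.
 */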
#ifdef CONFIG_COMPAT

#include <asm/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	cs_base = get_segment_base(regs->cs);
	ss_base = get_segment_base(regs->ss);

	fp = compat_ptr(ss_base + regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != 0)
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, cs_base + frame.return_address);
		fp = compat_ptr(ss_base + frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest OS callchains yet */
		return;
	}

	/*
	 * We don't know what to do with VM86 stacks; ignore them for now.
	 */
	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
		return;

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!current->mm)
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != 0)
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

/*
 * Deal with code segment offsets for the various execution modes:
 *
 *   VM86 - the good olde 16 bit days, where the linear address is
 *          20 bits and we use regs->ip + 0x10 * regs->cs.
 *
 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
 *          to figure out what the 32bit base address is.
 *
 *    X32 - has TIF_X32 set, but is running in x86_64
 *
 * X86_64 - CS,DS,SS,ES are all zero based.
 */
static unsigned long code_segment_base(struct pt_regs *regs)
{
	/*
	 * If we are in VM86 mode, add the segment offset to convert to a
	 * linear address.
	 */
	if (regs->flags & X86_VM_MASK)
		return 0x10 * regs->cs;

	/*
	 * For IA32 we look at the GDT/LDT segment base to convert the
	 * effective IP to a linear address.
	 */
#ifdef CONFIG_X86_32
	if (user_mode(regs) && regs->cs != __USER_CS)
		return get_segment_base(regs->cs);
#else
	if (test_thread_flag(TIF_IA32)) {
		if (user_mode(regs) && regs->cs != __USER32_CS)
			return get_segment_base(regs->cs);
	}
#endif
	return 0;
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		return perf_guest_cbs->get_guest_ip();

	return regs->ip + code_segment_base(regs);
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}

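/*
 * Export the PMU geometry (counter counts, widths, event mask) to other
 * parts of the kernel; the KVM vPMU is a typical consumer.
 */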
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version		= x86_pmu.version;
	cap->num_counters_gp	= x86_pmu.num_counters;
	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
	cap->bit_width_gp	= x86_pmu.cntval_bits;
	cap->bit_width_fixed	= x86_pmu.cntval_bits;
	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len	= x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);