#include "uncore.h"

static struct intel_uncore_type *empty_uncore[] = { NULL, };
struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
struct intel_uncore_type **uncore_pci_uncores = empty_uncore;

static bool pcidrv_registered;
struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_extra_dev *uncore_extra_pci_dev;
static int max_packages;

/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;

/* constraint for the fixed counter */
static struct event_constraint uncore_constraint_fixed =
	EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
struct event_constraint uncore_constraint_empty =
	EVENT_CONSTRAINT(0, 0, 0);

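/*
 * Look up the physical package id for a PCI bus via the pci2phy map.
 * Returns -1 if the bus segment has not been registered.
 */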
static int uncore_pcibus_to_physid(struct pci_bus *bus)
{
	struct pci2phy_map *map;
	int phys_id = -1;

	raw_spin_lock(&pci2phy_map_lock);
	list_for_each_entry(map, &pci2phy_map_head, list) {
		if (map->segment == pci_domain_nr(bus)) {
			phys_id = map->pbus_to_physid[bus->number];
			break;
		}
	}
	raw_spin_unlock(&pci2phy_map_lock);

	return phys_id;
}

static void uncore_free_pcibus_map(void)
{
	struct pci2phy_map *map, *tmp;

	list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
		list_del(&map->list);
		kfree(map);
	}
}

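/*
 * Find the pci2phy map entry for a PCI segment, allocating a new entry
 * if none exists yet. Must be called with pci2phy_map_lock held; the
 * lock is dropped temporarily around the allocation.
 */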
struct pci2phy_map *__find_pci2phy_map(int segment)
{
	struct pci2phy_map *map, *alloc = NULL;
	int i;

	lockdep_assert_held(&pci2phy_map_lock);

lookup:
	list_for_each_entry(map, &pci2phy_map_head, list) {
		if (map->segment == segment)
			goto end;
	}

	if (!alloc) {
		raw_spin_unlock(&pci2phy_map_lock);
		alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
		raw_spin_lock(&pci2phy_map_lock);

		if (!alloc)
			return NULL;

		goto lookup;
	}

	map = alloc;
	alloc = NULL;
	map->segment = segment;
	for (i = 0; i < 256; i++)
		map->pbus_to_physid[i] = -1;
	list_add_tail(&map->list, &pci2phy_map_head);

end:
	kfree(alloc);
	return map;
}

ssize_t uncore_event_show(struct kobject *kobj,
			  struct kobj_attribute *attr, char *buf)
{
	struct uncore_event_desc *event =
		container_of(attr, struct uncore_event_desc, attr);
	return sprintf(buf, "%s", event->config);
}

struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
	return pmu->boxes[topology_logical_package_id(cpu)];
}

u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
	u64 count;

	rdmsrl(event->hw.event_base, count);

	return count;
}

/*
 * generic get constraint function for shared match/mask registers.
 */
struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_extra_reg *er;
	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
	unsigned long flags;
	bool ok = false;

	/*
	 * reg->alloc can be set due to existing state, so for fake box we
	 * need to ignore this, otherwise we might fail to allocate proper
	 * fake state for this extra reg constraint.
	 */
	if (reg1->idx == EXTRA_REG_NONE ||
	    (!uncore_box_is_fake(box) && reg1->alloc))
		return NULL;

	er = &box->shared_regs[reg1->idx];
	raw_spin_lock_irqsave(&er->lock, flags);
	if (!atomic_read(&er->ref) ||
	    (er->config1 == reg1->config && er->config2 == reg2->config)) {
		atomic_inc(&er->ref);
		er->config1 = reg1->config;
		er->config2 = reg2->config;
		ok = true;
	}
	raw_spin_unlock_irqrestore(&er->lock, flags);

	if (ok) {
		if (!uncore_box_is_fake(box))
			reg1->alloc = 1;
		return NULL;
	}

	return &uncore_constraint_empty;
}

void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_extra_reg *er;
	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;

	/*
	 * Only put the constraint if the extra reg was actually allocated.
	 * Also takes care of events which do not use an extra shared reg.
	 *
	 * Also, if this is a fake box we shouldn't touch any event state
	 * (reg->alloc) and we don't care about leaving inconsistent box
	 * state either since it will be thrown out.
	 */
	if (uncore_box_is_fake(box) || !reg1->alloc)
		return;

	er = &box->shared_regs[reg1->idx];
	atomic_dec(&er->ref);
	reg1->alloc = 0;
}

u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
{
	struct intel_uncore_extra_reg *er;
	unsigned long flags;
	u64 config;

	er = &box->shared_regs[idx];

	raw_spin_lock_irqsave(&er->lock, flags);
	config = er->config;
	raw_spin_unlock_irqrestore(&er->lock, flags);

	return config;
}

static void uncore_assign_hw_event(struct intel_uncore_box *box,
				   struct perf_event *event, int idx)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = idx;
	hwc->last_tag = ++box->tags[idx];

	if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
		hwc->event_base = uncore_fixed_ctr(box);
		hwc->config_base = uncore_fixed_ctl(box);
		return;
	}

	hwc->config_base = uncore_event_ctl(box, hwc->idx);
	hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
}

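/*
 * Read the current counter value and accumulate the delta since the last
 * read into the perf event count. The shift trick truncates the delta to
 * the hardware counter width so that wraparound is handled correctly.
 */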
void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
{
	u64 prev_count, new_count, delta;
	int shift;

	if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
		shift = 64 - uncore_fixed_ctr_bits(box);
	else
		shift = 64 - uncore_perf_ctr_bits(box);

	/* the hrtimer might modify the previous event value */
again:
	prev_count = local64_read(&event->hw.prev_count);
	new_count = uncore_read_counter(box, event);
	if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
		goto again;

	delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}

/*
 * The overflow interrupt is unavailable for SandyBridge-EP and is broken
 * on SandyBridge, so we use a hrtimer to periodically poll the counters
 * to avoid overflow.
 */
static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
{
	struct intel_uncore_box *box;
	struct perf_event *event;
	unsigned long flags;
	int bit;

	box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
	if (!box->n_active || box->cpu != smp_processor_id())
		return HRTIMER_NORESTART;
	/*
	 * disable local interrupts to prevent uncore_pmu_event_start/stop
	 * from interrupting the update process
	 */
	local_irq_save(flags);

	/*
	 * handle boxes with an active event list as opposed to active
	 * counters
	 */
	list_for_each_entry(event, &box->active_list, active_entry) {
		uncore_perf_event_update(box, event);
	}

	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
		uncore_perf_event_update(box, box->events[bit]);

	local_irq_restore(flags);

	hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
	return HRTIMER_RESTART;
}

void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
	hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
		      HRTIMER_MODE_REL_PINNED);
}

void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
{
	hrtimer_cancel(&box->hrtimer);
}

static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
{
	hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	box->hrtimer.function = uncore_pmu_hrtimer;
}

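/*
 * Allocate and initialize a box, including its shared register locks and
 * the polling hrtimer. The box is not tied to a package or cpu yet.
 */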
static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
						 int node)
{
	int i, size, numshared = type->num_shared_regs;
	struct intel_uncore_box *box;

	size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);

	box = kzalloc_node(size, GFP_KERNEL, node);
	if (!box)
		return NULL;

	for (i = 0; i < numshared; i++)
		raw_spin_lock_init(&box->shared_regs[i].lock);

	uncore_pmu_init_hrtimer(box);
	box->cpu = -1;
	box->pci_phys_id = -1;
	box->pkgid = -1;

	/* set default hrtimer timeout */
	box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;

	INIT_LIST_HEAD(&box->active_list);

	return box;
}

/*
 * Use the uncore_pmu_event_init() pmu event_init callback
 * as a detection point for uncore events.
 */
static int uncore_pmu_event_init(struct perf_event *event);

static bool is_uncore_event(struct perf_event *event)
{
	return event->pmu->event_init == uncore_pmu_event_init;
}

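/*
 * Collect the leader (and, if dogrp is set, its siblings) into the box's
 * event list. Returns the new number of events or -EINVAL if the box
 * would run out of counters.
 */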
static int
uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
		      bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = box->pmu->type->num_counters;
	if (box->pmu->type->fixed_ctl)
		max_count++;

	if (box->n_events >= max_count)
		return -EINVAL;

	n = box->n_events;

	if (is_uncore_event(leader)) {
		box->event_list[n] = leader;
		n++;
	}

	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_uncore_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		box->event_list[n] = event;
		n++;
	}
	return n;
}

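/*
 * Resolve the constraint for an event: the type specific callback first,
 * then the fixed counter, then the static constraint table, falling back
 * to the unconstrained default.
 */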
static struct event_constraint *
uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_type *type = box->pmu->type;
	struct event_constraint *c;

	if (type->ops->get_constraint) {
		c = type->ops->get_constraint(box, event);
		if (c)
			return c;
	}

	if (event->attr.config == UNCORE_FIXED_EVENT)
		return &uncore_constraint_fixed;

	if (type->constraints) {
		for_each_event_constraint(c, type->constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &type->unconstrainted;
}

static void uncore_put_event_constraint(struct intel_uncore_box *box,
					struct perf_event *event)
{
	if (box->pmu->type->ops->put_constraint)
		box->pmu->type->ops->put_constraint(box, event);
}

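/*
 * Assign counters to the collected events. The fast path reuses previous
 * assignments when the constraints still allow it, otherwise the generic
 * perf_assign_events() scheduler is used.
 */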
static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
{
	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
	struct event_constraint *c;
	int i, wmin, wmax, ret = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);

	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = uncore_get_event_constraint(box, box->event_list[i]);
		box->event_constraint[i] = c;
		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/* fastpath, try to reuse previous register */
	for (i = 0; i < n; i++) {
		hwc = &box->event_list[i]->hw;
		c = box->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	/* slow path */
	if (i != n)
		ret = perf_assign_events(box->event_constraint, n,
					 wmin, wmax, n, assign);

	if (!assign || ret) {
		for (i = 0; i < n; i++)
			uncore_put_event_constraint(box, box->event_list[i]);
	}
	return ret ? -EINVAL : 0;
}

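/*
 * Start counting for an event that already has a counter assigned. The
 * first active event also enables the box and starts the polling hrtimer.
 */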
static void uncore_pmu_event_start(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
		return;

	event->hw.state = 0;
	box->events[idx] = event;
	box->n_active++;
	__set_bit(idx, box->active_mask);

	local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
	uncore_enable_event(box, event);

	if (box->n_active == 1) {
		uncore_enable_box(box);
		uncore_pmu_start_hrtimer(box);
	}
}

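/*
 * Stop an event. The last active event disables the box and cancels the
 * hrtimer; PERF_EF_UPDATE drains the remaining count.
 */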
static void uncore_pmu_event_stop(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
		uncore_disable_event(box, event);
		box->n_active--;
		box->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;

		if (box->n_active == 0) {
			uncore_disable_box(box);
			uncore_pmu_cancel_hrtimer(box);
		}
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		uncore_perf_event_update(box, event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

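/*
 * Add an event to the box: collect it into the event list, reschedule the
 * counters and (re)start any events that moved to new counters.
 */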
static int uncore_pmu_event_add(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	struct hw_perf_event *hwc = &event->hw;
	int assign[UNCORE_PMC_IDX_MAX];
	int i, n, ret;

	if (!box)
		return -ENODEV;

	ret = n = uncore_collect_events(box, event, false);
	if (ret < 0)
		return ret;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	ret = uncore_assign_events(box, assign, n);
	if (ret)
		return ret;

	/* save events moving to new counters */
	for (i = 0; i < box->n_events; i++) {
		event = box->event_list[i];
		hwc = &event->hw;

		if (hwc->idx == assign[i] &&
			hwc->last_tag == box->tags[assign[i]])
			continue;
		/*
		 * Ensure we don't accidentally enable a stopped
		 * counter simply because we rescheduled.
		 */
		if (hwc->state & PERF_HES_STOPPED)
			hwc->state |= PERF_HES_ARCH;

		uncore_pmu_event_stop(event, PERF_EF_UPDATE);
	}

	/* reprogram moved events into new counters */
	for (i = 0; i < n; i++) {
		event = box->event_list[i];
		hwc = &event->hw;

		if (hwc->idx != assign[i] ||
			hwc->last_tag != box->tags[assign[i]])
			uncore_assign_hw_event(box, event, assign[i]);
		else if (i < box->n_events)
			continue;

		if (hwc->state & PERF_HES_ARCH)
			continue;

		uncore_pmu_event_start(event, 0);
	}
	box->n_events = n;

	return 0;
}

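/*
 * Remove an event from the box and release its constraint, compacting the
 * event list.
 */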
static void uncore_pmu_event_del(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	int i;

	uncore_pmu_event_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < box->n_events; i++) {
		if (event == box->event_list[i]) {
			uncore_put_event_constraint(box, event);

			for (++i; i < box->n_events; i++)
				box->event_list[i - 1] = box->event_list[i];

			--box->n_events;
			break;
		}
	}

	event->hw.idx = -1;
	event->hw.last_tag = ~0ULL;
}

void uncore_pmu_event_read(struct perf_event *event)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	uncore_perf_event_update(box, event);
}

/*
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int uncore_validate_group(struct intel_uncore_pmu *pmu,
				struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct intel_uncore_box *fake_box;
	int ret = -EINVAL, n;

	fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
	if (!fake_box)
		return -ENOMEM;

	fake_box->pmu = pmu;
	/*
	 * The event is not yet connected with its siblings, therefore
	 * we must first collect the existing siblings and then add the
	 * new event before we can simulate the scheduling.
	 */
	n = uncore_collect_events(fake_box, leader, true);
	if (n < 0)
		goto out;

	fake_box->n_events = n;
	n = uncore_collect_events(fake_box, event, false);
	if (n < 0)
		goto out;

	fake_box->n_events = n;

	ret = uncore_assign_events(fake_box, NULL, n);
out:
	kfree(fake_box);
	return ret;
}

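/*
 * Event init: validate the event attributes, pin the event to the package
 * cpu that collects uncore events and apply type specific configuration.
 */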
static int uncore_pmu_event_init(struct perf_event *event)
{
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	struct hw_perf_event *hwc = &event->hw;
	int ret;

	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	pmu = uncore_event_to_pmu(event);
	/* no device found for this pmu */
	if (pmu->func_id < 0)
		return -ENOENT;

	/*
	 * The uncore PMU counts at all privilege levels all the time,
	 * so it doesn't make sense to specify any exclude bits.
	 */
	if (event->attr.exclude_user || event->attr.exclude_kernel ||
			event->attr.exclude_hv || event->attr.exclude_idle)
		return -EINVAL;

	/* Sampling not supported yet */
	if (hwc->sample_period)
		return -EINVAL;

	/*
	 * Place all uncore events for a particular physical package
	 * onto a single cpu
	 */
	if (event->cpu < 0)
		return -EINVAL;
	box = uncore_pmu_to_box(pmu, event->cpu);
	if (!box || box->cpu < 0)
		return -EINVAL;
	event->cpu = box->cpu;
	event->pmu_private = box;

	event->hw.idx = -1;
	event->hw.last_tag = ~0ULL;
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	if (event->attr.config == UNCORE_FIXED_EVENT) {
		/* no fixed counter */
		if (!pmu->type->fixed_ctl)
			return -EINVAL;
		/*
		 * if there is only one fixed counter, only the first pmu
		 * can access the fixed counter
		 */
		if (pmu->type->single_fixed && pmu->pmu_idx > 0)
			return -EINVAL;

		/* fixed counters have event field hardcoded to zero */
		hwc->config = 0ULL;
	} else {
		hwc->config = event->attr.config & pmu->type->event_mask;
		if (pmu->type->ops->hw_config) {
			ret = pmu->type->ops->hw_config(box, event);
			if (ret)
				return ret;
		}
	}

	if (event->group_leader != event)
		ret = uncore_validate_group(pmu, event);
	else
		ret = 0;

	return ret;
}

static ssize_t uncore_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);

static struct attribute *uncore_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group uncore_pmu_attr_group = {
	.attrs = uncore_pmu_attrs,
};

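/*
 * Register a PMU instance with perf, using the type's template pmu if one
 * is provided and deriving the name from the type and box index.
 */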
static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
	int ret;

	if (!pmu->type->pmu) {
		pmu->pmu = (struct pmu) {
			.attr_groups	= pmu->type->attr_groups,
			.task_ctx_nr	= perf_invalid_context,
			.event_init	= uncore_pmu_event_init,
			.add		= uncore_pmu_event_add,
			.del		= uncore_pmu_event_del,
			.start		= uncore_pmu_event_start,
			.stop		= uncore_pmu_event_stop,
			.read		= uncore_pmu_event_read,
		};
	} else {
		pmu->pmu = *pmu->type->pmu;
		pmu->pmu.attr_groups = pmu->type->attr_groups;
	}

	if (pmu->type->num_boxes == 1) {
		if (strlen(pmu->type->name) > 0)
			sprintf(pmu->name, "uncore_%s", pmu->type->name);
		else
			sprintf(pmu->name, "uncore");
	} else {
		sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
			pmu->pmu_idx);
	}

	ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
	if (!ret)
		pmu->registered = true;
	return ret;
}

static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
{
	if (!pmu->registered)
		return;
	perf_pmu_unregister(&pmu->pmu);
	pmu->registered = false;
}

static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
{
	int pkg;

	for (pkg = 0; pkg < max_packages; pkg++)
		kfree(pmu->boxes[pkg]);
	kfree(pmu->boxes);
}

static void __init uncore_type_exit(struct intel_uncore_type *type)
{
	struct intel_uncore_pmu *pmu = type->pmus;
	int i;

	if (pmu) {
		for (i = 0; i < type->num_boxes; i++, pmu++) {
			uncore_pmu_unregister(pmu);
			uncore_free_boxes(pmu);
		}
		kfree(type->pmus);
		type->pmus = NULL;
	}
	kfree(type->events_group);
	type->events_group = NULL;
}

static void __init uncore_types_exit(struct intel_uncore_type **types)
{
	for (; *types; types++)
		uncore_type_exit(*types);
}

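/*
 * Allocate the per type pmu array and the per package box pointer arrays,
 * set up the default constraint and build the optional events attribute
 * group from the type's event descriptors.
 */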
static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
{
	struct intel_uncore_pmu *pmus;
	struct attribute_group *attr_group;
	struct attribute **attrs;
	size_t size;
	int i, j;

	pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
	if (!pmus)
		return -ENOMEM;

	size = max_packages * sizeof(struct intel_uncore_box *);

	for (i = 0; i < type->num_boxes; i++) {
		pmus[i].func_id	= setid ? i : -1;
		pmus[i].pmu_idx	= i;
		pmus[i].type	= type;
		pmus[i].boxes	= kzalloc(size, GFP_KERNEL);
		if (!pmus[i].boxes)
			return -ENOMEM;
	}

	type->pmus = pmus;
	type->unconstrainted = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
				0, type->num_counters, 0, 0);

	if (type->event_descs) {
		for (i = 0; type->event_descs[i].attr.attr.name; i++);

		attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
					sizeof(*attr_group), GFP_KERNEL);
		if (!attr_group)
			return -ENOMEM;

		attrs = (struct attribute **)(attr_group + 1);
		attr_group->name = "events";
		attr_group->attrs = attrs;

		for (j = 0; j < i; j++)
			attrs[j] = &type->event_descs[j].attr.attr;

		type->events_group = attr_group;
	}

	type->pmu_group = &uncore_pmu_attr_group;
	return 0;
}

static int __init
uncore_types_init(struct intel_uncore_type **types, bool setid)
{
	int ret;

	for (; *types; types++) {
		ret = uncore_type_init(*types, setid);
		if (ret)
			return ret;
	}
	return 0;
}

/*
 * add a pci uncore device
 */
static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct intel_uncore_type *type;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int phys_id, pkg, ret;

	phys_id = uncore_pcibus_to_physid(pdev->bus);
	if (phys_id < 0)
		return -ENODEV;

	pkg = topology_phys_to_logical_pkg(phys_id);
	if (WARN_ON_ONCE(pkg < 0))
		return -EINVAL;

	if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
		int idx = UNCORE_PCI_DEV_IDX(id->driver_data);

		uncore_extra_pci_dev[pkg].dev[idx] = pdev;
		pci_set_drvdata(pdev, NULL);
		return 0;
	}

	type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
	/*
	 * For performance monitoring units with multiple boxes,
	 * each box has a different function id.
	 */
	pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
	/*
	 * Knights Landing uses a common PCI device ID for multiple instances
	 * of an uncore PMU device type. There is only one entry per device
	 * type in the knl_uncore_pci_ids table in spite of multiple devices
	 * present for some device types. Hence the PCI device idx would be 0
	 * for all devices. So increment the pmu pointer to point to an unused
	 * array element.
	 */
	if (boot_cpu_data.x86_model == 87) {
		while (pmu->func_id >= 0)
			pmu++;
	}

	if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
		return -EINVAL;

	box = uncore_alloc_box(type, NUMA_NO_NODE);
	if (!box)
		return -ENOMEM;

	if (pmu->func_id < 0)
		pmu->func_id = pdev->devfn;
	else
		WARN_ON_ONCE(pmu->func_id != pdev->devfn);

	atomic_inc(&box->refcnt);
	box->pci_phys_id = phys_id;
	box->pkgid = pkg;
	box->pci_dev = pdev;
	box->pmu = pmu;
	uncore_box_init(box);
	pci_set_drvdata(pdev, box);

	pmu->boxes[pkg] = box;
	if (atomic_inc_return(&pmu->activeboxes) > 1)
		return 0;

	/* First active box registers the pmu */
	ret = uncore_pmu_register(pmu);
	if (ret) {
		pci_set_drvdata(pdev, NULL);
		pmu->boxes[pkg] = NULL;
		uncore_box_exit(box);
		kfree(box);
	}
	return ret;
}

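/*
 * Remove a pci uncore device: drop the box reference and unregister the
 * pmu when the last active box of this type goes away.
 */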
static void uncore_pci_remove(struct pci_dev *pdev)
{
	struct intel_uncore_box *box;
	struct intel_uncore_pmu *pmu;
	int i, phys_id, pkg;

	phys_id = uncore_pcibus_to_physid(pdev->bus);
	pkg = topology_phys_to_logical_pkg(phys_id);

	box = pci_get_drvdata(pdev);
	if (!box) {
		for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
			if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
				uncore_extra_pci_dev[pkg].dev[i] = NULL;
				break;
			}
		}
		WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
		return;
	}

	pmu = box->pmu;
	if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
		return;

	pci_set_drvdata(pdev, NULL);
	pmu->boxes[pkg] = NULL;
	if (atomic_dec_return(&pmu->activeboxes) == 0)
		uncore_pmu_unregister(pmu);
	uncore_box_exit(box);
	kfree(box);
}

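/*
 * Set up the model specific PCI uncore types, allocate the extra device
 * array and register the uncore PCI driver.
 */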
static int __init uncore_pci_init(void)
{
	size_t size;
	int ret;

	switch (boot_cpu_data.x86_model) {
	case 45: /* Sandy Bridge-EP */
		ret = snbep_uncore_pci_init();
		break;
	case 62: /* Ivy Bridge-EP */
		ret = ivbep_uncore_pci_init();
		break;
	case 63: /* Haswell-EP */
		ret = hswep_uncore_pci_init();
		break;
	case 79: /* BDX-EP */
	case 86: /* BDX-DE */
		ret = bdx_uncore_pci_init();
		break;
	case 42: /* Sandy Bridge */
		ret = snb_uncore_pci_init();
		break;
	case 58: /* Ivy Bridge */
		ret = ivb_uncore_pci_init();
		break;
	case 60: /* Haswell */
	case 69: /* Haswell Celeron */
		ret = hsw_uncore_pci_init();
		break;
	case 61: /* Broadwell */
		ret = bdw_uncore_pci_init();
		break;
	case 87: /* Knights Landing */
		ret = knl_uncore_pci_init();
		break;
	case 94: /* SkyLake */
		ret = skl_uncore_pci_init();
		break;
	default:
		return 0;
	}

	if (ret)
		return ret;

	size = max_packages * sizeof(struct pci_extra_dev);
	uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
	if (!uncore_extra_pci_dev) {
		ret = -ENOMEM;
		goto err;
	}

	ret = uncore_types_init(uncore_pci_uncores, false);
	if (ret)
		goto errtype;

	uncore_pci_driver->probe = uncore_pci_probe;
	uncore_pci_driver->remove = uncore_pci_remove;

	ret = pci_register_driver(uncore_pci_driver);
	if (ret)
		goto errtype;

	pcidrv_registered = true;
	return 0;

errtype:
	uncore_types_exit(uncore_pci_uncores);
	kfree(uncore_extra_pci_dev);
	uncore_extra_pci_dev = NULL;
	uncore_free_pcibus_map();
err:
	uncore_pci_uncores = empty_uncore;
	return ret;
}

static void __init uncore_pci_exit(void)
{
	if (pcidrv_registered) {
		pcidrv_registered = false;
		pci_unregister_driver(uncore_pci_driver);
		uncore_types_exit(uncore_pci_uncores);
		kfree(uncore_extra_pci_dev);
		uncore_free_pcibus_map();
	}
}

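/*
 * A cpu in the package is going away: drop its reference on each box and
 * tear the box down when the last reference is gone.
 */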
static void uncore_cpu_dying(int cpu)
{
	struct intel_uncore_type *type, **types = uncore_msr_uncores;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, pkg;

	pkg = topology_logical_package_id(cpu);
	for (; *types; types++) {
		type = *types;
		pmu = type->pmus;
		for (i = 0; i < type->num_boxes; i++, pmu++) {
			box = pmu->boxes[pkg];
			if (box && atomic_dec_return(&box->refcnt) == 0)
				uncore_box_exit(box);
		}
	}
}

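/*
 * A cpu is coming up: take box references for it (or, on init, for all
 * online cpus in the package) and initialize each box on first use.
 */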
static void uncore_cpu_starting(int cpu, bool init)
{
	struct intel_uncore_type *type, **types = uncore_msr_uncores;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, pkg, ncpus = 1;

	if (init) {
		/*
		 * On init we get the number of online cpus in the package
		 * and set refcount for all of them.
		 */
		ncpus = cpumask_weight(topology_core_cpumask(cpu));
	}

	pkg = topology_logical_package_id(cpu);
	for (; *types; types++) {
		type = *types;
		pmu = type->pmus;
		for (i = 0; i < type->num_boxes; i++, pmu++) {
			box = pmu->boxes[pkg];
			if (!box)
				continue;
			/* The first cpu on a package activates the box */
			if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
				uncore_box_init(box);
		}
	}
}

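/*
 * The first cpu of a package allocates the boxes for all MSR uncore types
 * before the cpu comes online.
 */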
static int uncore_cpu_prepare(int cpu)
{
	struct intel_uncore_type *type, **types = uncore_msr_uncores;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, pkg;

	pkg = topology_logical_package_id(cpu);
	for (; *types; types++) {
		type = *types;
		pmu = type->pmus;
		for (i = 0; i < type->num_boxes; i++, pmu++) {
			if (pmu->boxes[pkg])
				continue;
			/* First cpu of a package allocates the box */
			box = uncore_alloc_box(type, cpu_to_node(cpu));
			if (!box)
				return -ENOMEM;
			box->pmu = pmu;
			box->pkgid = pkg;
			pmu->boxes[pkg] = box;
		}
	}
	return 0;
}

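/*
 * Move all boxes of a type in the package from old_cpu to new_cpu,
 * migrating any active perf context along with them.
 */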
static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
				   int new_cpu)
{
	struct intel_uncore_pmu *pmu = type->pmus;
	struct intel_uncore_box *box;
	int i, pkg;

	pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
	for (i = 0; i < type->num_boxes; i++, pmu++) {
		box = pmu->boxes[pkg];
		if (!box)
			continue;

		if (old_cpu < 0) {
			WARN_ON_ONCE(box->cpu != -1);
			box->cpu = new_cpu;
			continue;
		}

		WARN_ON_ONCE(box->cpu != old_cpu);
		box->cpu = -1;
		if (new_cpu < 0)
			continue;

		uncore_pmu_cancel_hrtimer(box);
		perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
		box->cpu = new_cpu;
	}
}

static void uncore_change_context(struct intel_uncore_type **uncores,
				  int old_cpu, int new_cpu)
{
	for (; *uncores; uncores++)
		uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
}

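/*
 * The cpu that collected uncore events for this package is going offline:
 * pick another online cpu in the package (if any) and migrate the events.
 */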
static void uncore_event_exit_cpu(int cpu)
{
	int target;

	/* Check if exiting cpu is used for collecting uncore events */
	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
		return;

	/* Find a new cpu to collect uncore events */
	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);

	/* Migrate uncore events to the new target */
	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &uncore_cpu_mask);
	else
		target = -1;

	uncore_change_context(uncore_msr_uncores, cpu, target);
	uncore_change_context(uncore_pci_uncores, cpu, target);
}

static void uncore_event_init_cpu(int cpu)
{
	int target;

	/*
	 * Check if there is an online cpu in the package
	 * which collects uncore events already.
	 */
	target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
	if (target < nr_cpu_ids)
		return;

	cpumask_set_cpu(cpu, &uncore_cpu_mask);

	uncore_change_context(uncore_msr_uncores, -1, cpu);
	uncore_change_context(uncore_pci_uncores, -1, cpu);
}

static int uncore_cpu_notifier(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		return notifier_from_errno(uncore_cpu_prepare(cpu));

	case CPU_STARTING:
		uncore_cpu_starting(cpu, false);
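		/* Fall through */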
	case CPU_DOWN_FAILED:
		uncore_event_init_cpu(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DYING:
		uncore_cpu_dying(cpu);
		break;

	case CPU_DOWN_PREPARE:
		uncore_event_exit_cpu(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block uncore_cpu_nb = {
	.notifier_call	= uncore_cpu_notifier,
	/*
	 * to migrate uncore events, our notifier should be executed
	 * before perf core's notifier.
	 */
	.priority	= CPU_PRI_PERF + 1,
};

static int __init type_pmu_register(struct intel_uncore_type *type)
{
	int i, ret;

	for (i = 0; i < type->num_boxes; i++) {
		ret = uncore_pmu_register(&type->pmus[i]);
		if (ret)
			return ret;
	}
	return 0;
}

static int __init uncore_msr_pmus_register(void)
{
	struct intel_uncore_type **types = uncore_msr_uncores;
	int ret;

	for (; *types; types++) {
		ret = type_pmu_register(*types);
		if (ret)
			return ret;
	}
	return 0;
}

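/*
 * Set up the model specific MSR uncore types and register their PMUs.
 */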
static int __init uncore_cpu_init(void)
{
	int ret;

	switch (boot_cpu_data.x86_model) {
	case 26: /* Nehalem */
	case 30:
	case 37: /* Westmere */
	case 44:
		nhm_uncore_cpu_init();
		break;
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
	case 60: /* Haswell */
	case 69: /* Haswell */
	case 70: /* Haswell */
	case 61: /* Broadwell */
	case 71: /* Broadwell */
		snb_uncore_cpu_init();
		break;
	case 45: /* Sandy Bridge-EP */
		snbep_uncore_cpu_init();
		break;
	case 46: /* Nehalem-EX */
	case 47: /* Westmere-EX aka. Xeon E7 */
		nhmex_uncore_cpu_init();
		break;
	case 62: /* Ivy Bridge-EP */
		ivbep_uncore_cpu_init();
		break;
	case 63: /* Haswell-EP */
		hswep_uncore_cpu_init();
		break;
	case 79: /* BDX-EP */
	case 86: /* BDX-DE */
		bdx_uncore_cpu_init();
		break;
	case 87: /* Knights Landing */
		knl_uncore_cpu_init();
		break;
	default:
		return 0;
	}

	ret = uncore_types_init(uncore_msr_uncores, true);
	if (ret)
		goto err;

	ret = uncore_msr_pmus_register();
	if (ret)
		goto err;
	return 0;
err:
	uncore_types_exit(uncore_msr_uncores);
	uncore_msr_uncores = empty_uncore;
	return ret;
}

static void __init uncore_cpu_setup(void *dummy)
{
	uncore_cpu_starting(smp_processor_id(), true);
}

/* Lazy to avoid allocation of a few bytes for the normal case */
static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC);

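/*
 * Pick one online cpu per package to collect uncore events, prepare and
 * start the boxes on it, then register the hotplug notifier.
 */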
static int __init uncore_cpumask_init(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		unsigned int pkg = topology_logical_package_id(cpu);
		int ret;

		if (test_and_set_bit(pkg, packages))
			continue;
		/*
		 * The first online cpu of each package takes the refcounts
		 * for all other online cpus in that package.
		 */
		ret = uncore_cpu_prepare(cpu);
		if (ret)
			return ret;
		uncore_event_init_cpu(cpu);
		smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
	}
	__register_cpu_notifier(&uncore_cpu_nb);
	return 0;
}

static int __init intel_uncore_init(void)
{
	int ret;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return -ENODEV;

	if (cpu_has_hypervisor)
		return -ENODEV;

	max_packages = topology_max_packages();

	ret = uncore_pci_init();
	if (ret)
		return ret;
	ret = uncore_cpu_init();
	if (ret)
		goto err;

	cpu_notifier_register_begin();
	ret = uncore_cpumask_init();
	if (ret)
		goto err;
	cpu_notifier_register_done();
	return 0;

err:
	uncore_types_exit(uncore_msr_uncores);
	uncore_pci_exit();
	cpu_notifier_register_done();
	return ret;
}
device_initcall(intel_uncore_init);