/*
 * File:	msi.c
 * Purpose:	PCI Message Signaled Interrupt (MSI)
 *
 * Copyright (C) 2003-2004 Intel
 * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
 */
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/ioport.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/msi.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/slab.h>

#include "pci.h"
static int pci_msi_enable = 1;

/* Non-zero when the architecture declares the MSI mask bits unusable. */
int pci_msi_ignore_mask;

/* MSI-X Table Size field encodes N-1; return the real entry count. */
#define msix_table_size(flags)	((flags & PCI_MSIX_FLAGS_QSIZE) + 1)


/* Arch hooks */
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
struct msi_controller * __weak pcibios_msi_controller(struct pci_dev *dev)
{
	return NULL;
}

static struct msi_controller *pci_msi_controller(struct pci_dev *dev)
{
	struct msi_controller *msi_ctrl = dev->bus->msi;

	if (msi_ctrl)
		return msi_ctrl;

	return pcibios_msi_controller(dev);
}

48 49
int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
{
50
	struct msi_controller *chip = pci_msi_controller(dev);
51 52 53 54 55 56 57 58 59 60 61 62
	int err;

	if (!chip || !chip->setup_irq)
		return -EINVAL;

	err = chip->setup_irq(chip, dev, desc);
	if (err < 0)
		return err;

	irq_set_chip_data(desc->irq, chip);

	return 0;
63 64 65
}

/* Tear down one MSI IRQ previously set up by arch_setup_msi_irq(). */
void __weak arch_teardown_msi_irq(unsigned int irq)
{
	struct msi_controller *chip = irq_get_chip_data(irq);

	if (!chip || !chip->teardown_irq)
		return;

	chip->teardown_irq(chip, irq);
}
75
int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
76 77 78 79
{
	struct msi_desc *entry;
	int ret;

80 81 82 83 84 85 86
	/*
	 * If an architecture wants to support multiple MSI, it needs to
	 * override arch_setup_msi_irqs()
	 */
	if (type == PCI_CAP_ID_MSI && nvec > 1)
		return 1;

87 88
	list_for_each_entry(entry, &dev->msi_list, list) {
		ret = arch_setup_msi_irq(dev, entry);
89
		if (ret < 0)
90
			return ret;
91 92
		if (ret > 0)
			return -ENOSPC;
93 94 95 96
	}

	return 0;
}
97

98 99 100 101
/*
 * We have a default implementation available as a separate non-weak
 * function, as it is used by the Xen x86 PCI code
 */
102
void default_teardown_msi_irqs(struct pci_dev *dev)
103
{
104
	int i;
105 106
	struct msi_desc *entry;

107 108 109 110
	list_for_each_entry(entry, &dev->msi_list, list)
		if (entry->irq)
			for (i = 0; i < entry->nvec_used; i++)
				arch_teardown_msi_irq(entry->irq + i);
111 112
}

113 114 115 116
void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
{
	return default_teardown_msi_irqs(dev);
}
117

118
static void default_restore_msi_irq(struct pci_dev *dev, int irq)
119 120 121 122 123 124 125 126 127 128 129 130 131 132
{
	struct msi_desc *entry;

	entry = NULL;
	if (dev->msix_enabled) {
		list_for_each_entry(entry, &dev->msi_list, list) {
			if (irq == entry->irq)
				break;
		}
	} else if (dev->msi_enabled)  {
		entry = irq_get_msi_desc(irq);
	}

	if (entry)
133
		__write_msi_msg(entry, &entry->msg);
134
}
135

136
void __weak arch_restore_msi_irqs(struct pci_dev *dev)
137
{
138
	return default_restore_msi_irqs(dev);
139
}
140

141
static void msi_set_enable(struct pci_dev *dev, int enable)
142 143 144
{
	u16 control;

145
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
146 147 148
	control &= ~PCI_MSI_FLAGS_ENABLE;
	if (enable)
		control |= PCI_MSI_FLAGS_ENABLE;
149
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
150 151
}

152
static void msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
153
{
154
	u16 ctrl;
155

156 157 158 159
	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
	ctrl &= ~clear;
	ctrl |= set;
	pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
160 161
}

162 163
static inline __attribute_const__ u32 msi_mask(unsigned x)
{
164 165 166 167
	/* Don't shift by >= width of type */
	if (x >= 5)
		return 0xffffffff;
	return (1 << (1 << x)) - 1;
168 169
}

170 171 172 173 174 175
/*
 * PCI 2.3 does not specify mask bits for each MSI interrupt.  Attempting to
 * mask all MSI interrupts by clearing the MSI enable bit does not work
 * reliably as devices without an INTx disable bit will then generate a
 * level IRQ which will never be cleared.
 */
176
u32 __msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
L
Linus Torvalds 已提交
177
{
178
	u32 mask_bits = desc->masked;
L
Linus Torvalds 已提交
179

180
	if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
181
		return 0;
182 183 184 185

	mask_bits &= ~mask;
	mask_bits |= flag;
	pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits);
186 187 188 189 190 191

	return mask_bits;
}

/* Mask/unmask MSI vectors and cache the new mask-register value. */
static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	desc->masked = __msi_mask_irq(desc, mask, flag);
}

/*
 * This internal function does not flush PCI writes to the device.
 * All users must ensure that they read from the device before either
 * assuming that the device state is up to date, or returning out of this
 * file.  This saves a few milliseconds when initialising devices with lots
 * of MSI-X interrupts.
 */
202
u32 __msix_mask_irq(struct msi_desc *desc, u32 flag)
203 204 205
{
	u32 mask_bits = desc->masked;
	unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
206
						PCI_MSIX_ENTRY_VECTOR_CTRL;
207 208 209 210

	if (pci_msi_ignore_mask)
		return 0;

211 212 213
	mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
	if (flag)
		mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
214
	writel(mask_bits, desc->mask_base + offset);
215 216 217 218 219 220

	return mask_bits;
}

/* Mask/unmask one MSI-X vector and cache the vector-control value. */
static void msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	desc->masked = __msix_mask_irq(desc, flag);
}
223

224
static void msi_set_mask_bit(struct irq_data *data, u32 flag)
225
{
226
	struct msi_desc *desc = irq_data_get_msi(data);
227

228 229 230 231
	if (desc->msi_attrib.is_msix) {
		msix_mask_irq(desc, flag);
		readl(desc->mask_base);		/* Flush write to device */
	} else {
232
		unsigned offset = data->irq - desc->irq;
233
		msi_mask_irq(desc, 1 << offset, flag << offset);
L
Linus Torvalds 已提交
234
	}
235 236
}

/* irq_chip ->irq_mask() callback for MSI/MSI-X interrupts. */
void mask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 1);
}

/* irq_chip ->irq_unmask() callback for MSI/MSI-X interrupts. */
void unmask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 0);
}

247 248 249 250
void default_restore_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry;

251
	list_for_each_entry(entry, &dev->msi_list, list)
252 253 254
		default_restore_msi_irq(dev, entry->irq);
}

255
void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
L
Linus Torvalds 已提交
256
{
257 258 259 260 261 262 263 264 265 266 267
	BUG_ON(entry->dev->current_state != PCI_D0);

	if (entry->msi_attrib.is_msix) {
		void __iomem *base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
	} else {
		struct pci_dev *dev = entry->dev;
268
		int pos = dev->msi_cap;
269 270
		u16 data;

271 272
		pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				      &msg->address_lo);
273
		if (entry->msi_attrib.is_64) {
274 275
			pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					      &msg->address_hi);
B
Bjorn Helgaas 已提交
276
			pci_read_config_word(dev, pos + PCI_MSI_DATA_64, &data);
277 278
		} else {
			msg->address_hi = 0;
B
Bjorn Helgaas 已提交
279
			pci_read_config_word(dev, pos + PCI_MSI_DATA_32, &data);
280 281 282 283 284 285 286
		}
		msg->data = data;
	}
}

/* Convenience wrapper: read the MSI message for a Linux IRQ number. */
void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__read_msi_msg(entry, msg);
}

292
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
293 294
{
	/* Assert that the cache is valid, assuming that
295 296 297
	 * valid messages are not all-zeroes. */
	BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
		 entry->msg.data));
298

299
	*msg = entry->msg;
300
}
L
Linus Torvalds 已提交
301

/* Cached-message lookup by Linux IRQ number. */
void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__get_cached_msi_msg(entry, msg);
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
Y
Yinghai Lu 已提交
309

310
void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
Y
Yinghai Lu 已提交
311
{
312 313 314
	if (entry->dev->current_state != PCI_D0) {
		/* Don't touch the hardware now */
	} else if (entry->msi_attrib.is_msix) {
315 316 317 318
		void __iomem *base;
		base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

319 320 321
		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
322
	} else {
323
		struct pci_dev *dev = entry->dev;
324
		int pos = dev->msi_cap;
325 326
		u16 msgctl;

327
		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
328 329
		msgctl &= ~PCI_MSI_FLAGS_QSIZE;
		msgctl |= entry->msi_attrib.multiple << 4;
330
		pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
331

332 333
		pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				       msg->address_lo);
334
		if (entry->msi_attrib.is_64) {
335 336
			pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					       msg->address_hi);
B
Bjorn Helgaas 已提交
337 338
			pci_write_config_word(dev, pos + PCI_MSI_DATA_64,
					      msg->data);
339
		} else {
B
Bjorn Helgaas 已提交
340 341
			pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
					      msg->data);
342
		}
L
Linus Torvalds 已提交
343
	}
344
	entry->msg = *msg;
L
Linus Torvalds 已提交
345
}
346

Y
Yinghai Lu 已提交
347 348
/* Program the MSI message for a Linux IRQ number. */
void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__write_msi_msg(entry, msg);
}
EXPORT_SYMBOL_GPL(write_msi_msg);
Y
Yinghai Lu 已提交
354

355 356 357
static void free_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry, *tmp;
358 359
	struct attribute **msi_attrs;
	struct device_attribute *dev_attr;
360
	int i, count = 0;
361

362 363 364 365
	list_for_each_entry(entry, &dev->msi_list, list)
		if (entry->irq)
			for (i = 0; i < entry->nvec_used; i++)
				BUG_ON(irq_has_action(entry->irq + i));
366 367 368 369 370 371 372 373

	arch_teardown_msi_irqs(dev);

	list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
		if (entry->msi_attrib.is_msix) {
			if (list_is_last(&entry->list, &dev->msi_list))
				iounmap(entry->mask_base);
		}
374

375 376 377
		list_del(&entry->list);
		kfree(entry);
	}
378 379 380 381

	if (dev->msi_irq_groups) {
		sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups);
		msi_attrs = dev->msi_irq_groups[0]->attrs;
382
		while (msi_attrs[count]) {
383 384 385 386 387 388 389 390 391 392 393
			dev_attr = container_of(msi_attrs[count],
						struct device_attribute, attr);
			kfree(dev_attr->attr.name);
			kfree(dev_attr);
			++count;
		}
		kfree(msi_attrs);
		kfree(dev->msi_irq_groups[0]);
		kfree(dev->msi_irq_groups);
		dev->msi_irq_groups = NULL;
	}
394
}
S
Satoru Takeuchi 已提交
395

396
static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
L
Linus Torvalds 已提交
397
{
398 399
	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
	if (!desc)
L
Linus Torvalds 已提交
400 401
		return NULL;

402 403
	INIT_LIST_HEAD(&desc->list);
	desc->dev = dev;
L
Linus Torvalds 已提交
404

405
	return desc;
L
Linus Torvalds 已提交
406 407
}

408 409 410 411 412 413
/* Toggle INTx, unless a quirk flags the device as broken with INTx masked. */
static void pci_intx_for_msi(struct pci_dev *dev, int enable)
{
	if (dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG)
		return;
	pci_intx(dev, enable);
}

414
static void __pci_restore_msi_state(struct pci_dev *dev)
415 416
{
	u16 control;
417
	struct msi_desc *entry;
418

419 420 421
	if (!dev->msi_enabled)
		return;

422
	entry = irq_get_msi_desc(dev->irq);
423

424
	pci_intx_for_msi(dev, 0);
425
	msi_set_enable(dev, 0);
426
	arch_restore_msi_irqs(dev);
427

428
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
429 430
	msi_mask_irq(entry, msi_mask(entry->msi_attrib.multi_cap),
		     entry->masked);
431
	control &= ~PCI_MSI_FLAGS_QSIZE;
432
	control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE;
433
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
434 435 436
}

static void __pci_restore_msix_state(struct pci_dev *dev)
437 438 439
{
	struct msi_desc *entry;

E
Eric W. Biederman 已提交
440 441
	if (!dev->msix_enabled)
		return;
442
	BUG_ON(list_empty(&dev->msi_list));
E
Eric W. Biederman 已提交
443

444
	/* route the table */
445
	pci_intx_for_msi(dev, 0);
446 447
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
448

449
	arch_restore_msi_irqs(dev);
450
	list_for_each_entry(entry, &dev->msi_list, list)
451
		msix_mask_irq(entry, entry->masked);
452

453
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
454
}
455 456 457 458 459 460

/* Restore both MSI and MSI-X state (used by PCI PM / error recovery). */
void pci_restore_msi_state(struct pci_dev *dev)
{
	__pci_restore_msi_state(dev);
	__pci_restore_msix_state(dev);
}
EXPORT_SYMBOL_GPL(pci_restore_msi_state);
462

463
static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
464 465
			     char *buf)
{
466 467 468
	struct msi_desc *entry;
	unsigned long irq;
	int retval;
469

470 471 472
	retval = kstrtoul(attr->attr.name, 10, &irq);
	if (retval)
		return retval;
473

474 475 476 477 478
	entry = irq_get_msi_desc(irq);
	if (entry)
		return sprintf(buf, "%s\n",
				entry->msi_attrib.is_msix ? "msix" : "msi");

479
	return -ENODEV;
480 481 482 483
}

static int populate_msi_sysfs(struct pci_dev *pdev)
{
484 485 486 487 488
	struct attribute **msi_attrs;
	struct attribute *msi_attr;
	struct device_attribute *msi_dev_attr;
	struct attribute_group *msi_irq_group;
	const struct attribute_group **msi_irq_groups;
489
	struct msi_desc *entry;
490 491
	int ret = -ENOMEM;
	int num_msi = 0;
492 493
	int count = 0;

494
	/* Determine how many msi entries we have */
495
	list_for_each_entry(entry, &pdev->msi_list, list)
496 497 498
		++num_msi;
	if (!num_msi)
		return 0;
499

500 501 502 503
	/* Dynamically create the MSI attributes for the PCI device */
	msi_attrs = kzalloc(sizeof(void *) * (num_msi + 1), GFP_KERNEL);
	if (!msi_attrs)
		return -ENOMEM;
504
	list_for_each_entry(entry, &pdev->msi_list, list) {
505
		msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
506
		if (!msi_dev_attr)
507
			goto error_attrs;
508
		msi_attrs[count] = &msi_dev_attr->attr;
509

510
		sysfs_attr_init(&msi_dev_attr->attr);
511 512 513 514
		msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
						    entry->irq);
		if (!msi_dev_attr->attr.name)
			goto error_attrs;
515 516 517
		msi_dev_attr->attr.mode = S_IRUGO;
		msi_dev_attr->show = msi_mode_show;
		++count;
518 519
	}

520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
	msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
	if (!msi_irq_group)
		goto error_attrs;
	msi_irq_group->name = "msi_irqs";
	msi_irq_group->attrs = msi_attrs;

	msi_irq_groups = kzalloc(sizeof(void *) * 2, GFP_KERNEL);
	if (!msi_irq_groups)
		goto error_irq_group;
	msi_irq_groups[0] = msi_irq_group;

	ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups);
	if (ret)
		goto error_irq_groups;
	pdev->msi_irq_groups = msi_irq_groups;

536 537
	return 0;

538 539 540 541 542 543 544 545 546 547 548 549 550
error_irq_groups:
	kfree(msi_irq_groups);
error_irq_group:
	kfree(msi_irq_group);
error_attrs:
	count = 0;
	msi_attr = msi_attrs[count];
	while (msi_attr) {
		msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
		kfree(msi_attr->name);
		kfree(msi_dev_attr);
		++count;
		msi_attr = msi_attrs[count];
551
	}
552
	kfree(msi_attrs);
553 554 555
	return ret;
}

556
static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
{
	u16 control;
	struct msi_desc *entry;

	/* MSI Entry Initialization */
	entry = alloc_msi_entry(dev);
	if (!entry)
		return NULL;

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);

	entry->msi_attrib.is_msix	= 0;
	entry->msi_attrib.is_64		= !!(control & PCI_MSI_FLAGS_64BIT);
	entry->msi_attrib.entry_nr	= 0;
	entry->msi_attrib.maskbit	= !!(control & PCI_MSI_FLAGS_MASKBIT);
	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
	entry->msi_attrib.multi_cap	= (control & PCI_MSI_FLAGS_QMASK) >> 1;
574 575
	entry->msi_attrib.multiple	= ilog2(__roundup_pow_of_two(nvec));
	entry->nvec_used		= nvec;
576 577 578 579 580 581 582 583 584 585 586 587 588

	if (control & PCI_MSI_FLAGS_64BIT)
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
	else
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_32;

	/* Save the initial mask status */
	if (entry->msi_attrib.maskbit)
		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);

	return entry;
}

L
Linus Torvalds 已提交
589 590 591
/**
 * msi_capability_init - configure device's MSI capability structure
 * @dev: pointer to the pci_dev data structure of MSI device function
592
 * @nvec: number of interrupts to allocate
L
Linus Torvalds 已提交
593
 *
594 595 596 597 598 599 600
 * Setup the MSI capability structure of the device with the requested
 * number of interrupts.  A return value of zero indicates the successful
 * setup of an entry with the new MSI irq.  A negative return value indicates
 * an error, and a positive return value indicates the number of interrupts
 * which could have been allocated.
 */
static int msi_capability_init(struct pci_dev *dev, int nvec)
L
Linus Torvalds 已提交
601 602
{
	struct msi_desc *entry;
603
	int ret;
604
	unsigned mask;
L
Linus Torvalds 已提交
605

606
	msi_set_enable(dev, 0);	/* Disable MSI during set up */
607

608
	entry = msi_setup_entry(dev, nvec);
609 610
	if (!entry)
		return -ENOMEM;
611

612
	/* All MSIs are unmasked by default, Mask them all */
613
	mask = msi_mask(entry->msi_attrib.multi_cap);
614 615
	msi_mask_irq(entry, mask, mask);

616
	list_add_tail(&entry->list, &dev->msi_list);
617

L
Linus Torvalds 已提交
618
	/* Configure MSI capability structure */
619
	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
620
	if (ret) {
621
		msi_mask_irq(entry, mask, ~mask);
622
		free_msi_irqs(dev);
623
		return ret;
624
	}
625

626 627 628 629 630 631 632
	ret = populate_msi_sysfs(dev);
	if (ret) {
		msi_mask_irq(entry, mask, ~mask);
		free_msi_irqs(dev);
		return ret;
	}

L
Linus Torvalds 已提交
633
	/* Set MSI enabled bits	 */
634
	pci_intx_for_msi(dev, 0);
635
	msi_set_enable(dev, 1);
636
	dev->msi_enabled = 1;
L
Linus Torvalds 已提交
637

638
	dev->irq = entry->irq;
L
Linus Torvalds 已提交
639 640 641
	return 0;
}

642
/* ioremap the MSI-X table, located via the Table Offset/BIR register. */
static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
{
	resource_size_t phys_addr;
	u32 table_offset;
	u8 bir;

	pci_read_config_dword(dev, dev->msix_cap + PCI_MSIX_TABLE,
			      &table_offset);
	bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
	table_offset &= PCI_MSIX_TABLE_OFFSET;
	phys_addr = pci_resource_start(dev, bir) + table_offset;

	return ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE);
}

657 658
/* Allocate one MSI descriptor per requested MSI-X entry. */
static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
			      struct msix_entry *entries, int nvec)
{
	struct msi_desc *entry;
	int i;

	for (i = 0; i < nvec; i++) {
		entry = alloc_msi_entry(dev);
		if (!entry) {
			if (!i)
				iounmap(base);
			else
				free_msi_irqs(dev);
			/* No enough memory. Don't try again */
			return -ENOMEM;
		}

		entry->msi_attrib.is_msix	= 1;
		entry->msi_attrib.is_64		= 1;
		entry->msi_attrib.entry_nr	= entries[i].entry;
		entry->msi_attrib.default_irq	= dev->irq;
		entry->mask_base		= base;
		entry->nvec_used		= 1;

		list_add_tail(&entry->list, &dev->msi_list);
	}

	return 0;
}

687
static void msix_program_entries(struct pci_dev *dev,
688
				 struct msix_entry *entries)
689 690 691 692 693 694 695 696 697
{
	struct msi_desc *entry;
	int i = 0;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE +
						PCI_MSIX_ENTRY_VECTOR_CTRL;

		entries[i].vector = entry->irq;
698
		irq_set_msi_desc(entry->irq, entry);
699 700 701 702 703 704
		entry->masked = readl(entry->mask_base + offset);
		msix_mask_irq(entry, 1);
		i++;
	}
}

L
Linus Torvalds 已提交
705 706 707
/**
 * msix_capability_init - configure device's MSI-X capability
 * @dev: pointer to the pci_dev data structure of MSI-X device function
R
Randy Dunlap 已提交
708 709
 * @entries: pointer to an array of struct msix_entry entries
 * @nvec: number of @entries
L
Linus Torvalds 已提交
710
 *
711
 * Setup the MSI-X capability structure of device function with a
712 713
 * single MSI-X irq. A return of zero indicates the successful setup of
 * requested MSI-X entries with allocated irqs or non-zero for otherwise.
L
Linus Torvalds 已提交
714 715 716 717
 **/
static int msix_capability_init(struct pci_dev *dev,
				struct msix_entry *entries, int nvec)
{
718
	int ret;
719
	u16 control;
L
Linus Torvalds 已提交
720 721
	void __iomem *base;

722
	/* Ensure MSI-X is disabled while it is set up */
723
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
724

725
	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
L
Linus Torvalds 已提交
726
	/* Request & Map MSI-X table region */
727
	base = msix_map_region(dev, msix_table_size(control));
728
	if (!base)
L
Linus Torvalds 已提交
729 730
		return -ENOMEM;

731
	ret = msix_setup_entries(dev, base, entries, nvec);
732 733
	if (ret)
		return ret;
734 735

	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
736
	if (ret)
737
		goto out_avail;
738

739 740 741 742 743
	/*
	 * Some devices require MSI-X to be enabled before we can touch the
	 * MSI-X registers.  We need to mask all the vectors to prevent
	 * interrupts coming in before they're fully set up.
	 */
744 745
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);
746

747
	msix_program_entries(dev, entries);
748

749
	ret = populate_msi_sysfs(dev);
750 751
	if (ret)
		goto out_free;
752

753
	/* Set MSI-X enabled bits and unmask the function */
754
	pci_intx_for_msi(dev, 0);
755
	dev->msix_enabled = 1;
L
Linus Torvalds 已提交
756

757
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
758

L
Linus Torvalds 已提交
759
	return 0;
760

761
out_avail:
762 763 764 765 766
	if (ret < 0) {
		/*
		 * If we had some success, report the number of irqs
		 * we succeeded in setting up.
		 */
767
		struct msi_desc *entry;
768 769 770 771 772 773 774 775 776 777
		int avail = 0;

		list_for_each_entry(entry, &dev->msi_list, list) {
			if (entry->irq != 0)
				avail++;
		}
		if (avail != 0)
			ret = avail;
	}

778
out_free:
779 780 781
	free_msi_irqs(dev);

	return ret;
L
Linus Torvalds 已提交
782 783
}

784
/**
785
 * pci_msi_supported - check whether MSI may be enabled on a device
786
 * @dev: pointer to the pci_dev data structure of MSI device function
787
 * @nvec: how many MSIs have been requested ?
788
 *
789
 * Look at global flags, the device itself, and its parent buses
790
 * to determine if MSI/-X are supported for the device. If MSI/-X is
791
 * supported return 1, else return 0.
792
 **/
793
static int pci_msi_supported(struct pci_dev *dev, int nvec)
794 795 796
{
	struct pci_bus *bus;

797
	/* MSI must be globally enabled and supported by the device */
798
	if (!pci_msi_enable)
799
		return 0;
800 801

	if (!dev || dev->no_msi || dev->current_state != PCI_D0)
802
		return 0;
803

804 805 806 807 808 809
	/*
	 * You can't ask to have 0 or less MSIs configured.
	 *  a) it's stupid ..
	 *  b) the list manipulation code assumes nvec >= 1.
	 */
	if (nvec < 1)
810
		return 0;
811

H
Hidetoshi Seto 已提交
812 813 814
	/*
	 * Any bridge which does NOT route MSI transactions from its
	 * secondary bus to its primary bus must set NO_MSI flag on
815 816 817 818
	 * the secondary pci_bus.
	 * We expect only arch-specific PCI host bus controller driver
	 * or quirks for specific PCI bridges to be setting NO_MSI.
	 */
819 820
	for (bus = dev->bus; bus; bus = bus->parent)
		if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
821
			return 0;
822

823
	return 1;
824 825
}

826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850
/**
 * pci_msi_vec_count - Return the number of MSI vectors a device can send
 * @dev: device to report about
 *
 * This function returns the number of MSI vectors a device requested via
 * Multiple Message Capable register. It returns a negative errno if the
 * device is not capable sending MSI interrupts. Otherwise, the call succeeds
 * and returns a power of two, up to a maximum of 2^5 (32), according to the
 * MSI specification.
 **/
int pci_msi_vec_count(struct pci_dev *dev)
{
	u16 msgctl;

	if (!dev->msi_cap)
		return -EINVAL;

	/* MMC field holds log2 of the vector count. */
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &msgctl);
	return 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
}
EXPORT_SYMBOL(pci_msi_vec_count);

851
void pci_msi_shutdown(struct pci_dev *dev)
L
Linus Torvalds 已提交
852
{
853 854
	struct msi_desc *desc;
	u32 mask;
L
Linus Torvalds 已提交
855

856
	if (!pci_msi_enable || !dev || !dev->msi_enabled)
E
Eric W. Biederman 已提交
857 858
		return;

859 860 861
	BUG_ON(list_empty(&dev->msi_list));
	desc = list_first_entry(&dev->msi_list, struct msi_desc, list);

862
	msi_set_enable(dev, 0);
863
	pci_intx_for_msi(dev, 1);
864
	dev->msi_enabled = 0;
865

866
	/* Return the device with MSI unmasked as initial states */
867
	mask = msi_mask(desc->msi_attrib.multi_cap);
868
	/* Keep cached state to be restored */
869
	__msi_mask_irq(desc, mask, ~mask);
870 871

	/* Restore dev->irq to its default pin-assertion irq */
872
	dev->irq = desc->msi_attrib.default_irq;
873
}
874

H
Hidetoshi Seto 已提交
875
void pci_disable_msi(struct pci_dev *dev)
876 877 878 879 880
{
	if (!pci_msi_enable || !dev || !dev->msi_enabled)
		return;

	pci_msi_shutdown(dev);
881
	free_msi_irqs(dev);
L
Linus Torvalds 已提交
882
}
883
EXPORT_SYMBOL(pci_disable_msi);
L
Linus Torvalds 已提交
884

885
/**
886
 * pci_msix_vec_count - return the number of device's MSI-X table entries
887
 * @dev: pointer to the pci_dev data structure of MSI-X device function
888 889 890 891 892 893
 * This function returns the number of device's MSI-X table entries and
 * therefore the number of MSI-X vectors device is capable of sending.
 * It returns a negative errno if the device is not capable of sending MSI-X
 * interrupts.
 **/
int pci_msix_vec_count(struct pci_dev *dev)
894 895 896
{
	u16 control;

897
	if (!dev->msix_cap)
898
		return -EINVAL;
899

900
	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
901
	return msix_table_size(control);
902
}
903
EXPORT_SYMBOL(pci_msix_vec_count);
904

L
Linus Torvalds 已提交
905 906 907
/**
 * pci_enable_msix - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
908
 * @entries: pointer to an array of MSI-X entries
909
 * @nvec: number of MSI-X irqs requested for allocation by device driver
L
Linus Torvalds 已提交
910 911
 *
 * Setup the MSI-X capability structure of device function with the number
912
 * of requested irqs upon its software driver call to request for
L
Linus Torvalds 已提交
913 914
 * MSI-X mode enabled on its hardware device function. A return of zero
 * indicates the successful configuration of MSI-X capability structure
915
 * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
L
Linus Torvalds 已提交
916
 * Or a return of > 0 indicates that driver request is exceeding the number
917 918
 * of irqs or MSI-X vectors available. Driver should use the returned value to
 * re-send its request.
L
Linus Torvalds 已提交
919
 **/
H
Hidetoshi Seto 已提交
920
int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
L
Linus Torvalds 已提交
921
{
922
	int nr_entries;
E
Eric W. Biederman 已提交
923
	int i, j;
L
Linus Torvalds 已提交
924

925 926
	if (!pci_msi_supported(dev, nvec))
		return -EINVAL;
927

928 929 930
	if (!entries)
		return -EINVAL;

931 932 933
	nr_entries = pci_msix_vec_count(dev);
	if (nr_entries < 0)
		return nr_entries;
L
Linus Torvalds 已提交
934
	if (nvec > nr_entries)
935
		return nr_entries;
L
Linus Torvalds 已提交
936 937 938 939 940 941 942 943 944 945

	/* Check for any invalid entries */
	for (i = 0; i < nvec; i++) {
		if (entries[i].entry >= nr_entries)
			return -EINVAL;		/* invalid entry */
		for (j = i + 1; j < nvec; j++) {
			if (entries[i].entry == entries[j].entry)
				return -EINVAL;	/* duplicate entry */
		}
	}
E
Eric W. Biederman 已提交
946
	WARN_ON(!!dev->msix_enabled);
947

948
	/* Check whether driver already requested for MSI irq */
H
Hidetoshi Seto 已提交
949
	if (dev->msi_enabled) {
950
		dev_info(&dev->dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
L
Linus Torvalds 已提交
951 952
		return -EINVAL;
	}
953
	return msix_capability_init(dev, entries, nvec);
L
Linus Torvalds 已提交
954
}
955
EXPORT_SYMBOL(pci_enable_msix);
L
Linus Torvalds 已提交
956

H
Hidetoshi Seto 已提交
957
void pci_msix_shutdown(struct pci_dev *dev)
958
{
959 960
	struct msi_desc *entry;

961
	if (!pci_msi_enable || !dev || !dev->msix_enabled)
E
Eric W. Biederman 已提交
962 963
		return;

964 965 966
	/* Return the device with MSI-X masked as initial states */
	list_for_each_entry(entry, &dev->msi_list, list) {
		/* Keep cached states to be restored */
967
		__msix_mask_irq(entry, 1);
968 969
	}

970
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
971
	pci_intx_for_msi(dev, 1);
972
	dev->msix_enabled = 0;
973
}
974

H
Hidetoshi Seto 已提交
975
void pci_disable_msix(struct pci_dev *dev)
976 977 978 979 980
{
	if (!pci_msi_enable || !dev || !dev->msix_enabled)
		return;

	pci_msix_shutdown(dev);
981
	free_msi_irqs(dev);
L
Linus Torvalds 已提交
982
}
983
EXPORT_SYMBOL(pci_disable_msix);
L
Linus Torvalds 已提交
984

985 986 987 988
/* Globally disable MSI (triggered by the "pci=nomsi" boot parameter). */
void pci_no_msi(void)
{
	pci_msi_enable = 0;
}
989

990 991 992 993 994 995 996
/**
 * pci_msi_enabled - is MSI enabled?
 *
 * Returns true if MSI has not been disabled by the command-line option
 * pci=nomsi.
 **/
int pci_msi_enabled(void)
997
{
998
	return pci_msi_enable;
999
}
1000
EXPORT_SYMBOL(pci_msi_enabled);
1001

1002
void pci_msi_init_pci_dev(struct pci_dev *dev)
1003
{
1004
	INIT_LIST_HEAD(&dev->msi_list);
1005 1006 1007 1008 1009

	/* Disable the msi hardware to avoid screaming interrupts
	 * during boot.  This is the power on reset default so
	 * usually this should be a noop.
	 */
1010 1011 1012 1013 1014 1015
	dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
	if (dev->msi_cap)
		msi_set_enable(dev, 0);

	dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
	if (dev->msix_cap)
1016
		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
1017
}
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032

/**
 * pci_enable_msi_range - configure device's MSI capability structure
 * @dev: device to configure
 * @minvec: minimal number of interrupts to configure
 * @maxvec: maximum number of interrupts to configure
 *
 * This function tries to allocate a maximum possible number of interrupts in a
 * range between @minvec and @maxvec. It returns a negative errno if an error
 * occurs. If it succeeds, it returns the actual number of interrupts allocated
 * and updates the @dev's irq member to the lowest new interrupt number;
 * the other interrupt numbers allocated to this device are consecutive.
 **/
int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
{
1033
	int nvec;
1034 1035
	int rc;

1036 1037
	if (!pci_msi_supported(dev, minvec))
		return -EINVAL;
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047

	WARN_ON(!!dev->msi_enabled);

	/* Check whether driver already requested MSI-X irqs */
	if (dev->msix_enabled) {
		dev_info(&dev->dev,
			 "can't enable MSI (MSI-X already enabled)\n");
		return -EINVAL;
	}

1048 1049 1050
	if (maxvec < minvec)
		return -ERANGE;

1051 1052 1053 1054 1055 1056 1057 1058
	nvec = pci_msi_vec_count(dev);
	if (nvec < 0)
		return nvec;
	else if (nvec < minvec)
		return -EINVAL;
	else if (nvec > maxvec)
		nvec = maxvec;

1059
	do {
1060
		rc = msi_capability_init(dev, nvec);
1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111
		if (rc < 0) {
			return rc;
		} else if (rc > 0) {
			if (rc < minvec)
				return -ENOSPC;
			nvec = rc;
		}
	} while (rc);

	return nvec;
}
EXPORT_SYMBOL(pci_enable_msi_range);

/**
 * pci_enable_msix_range - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of MSI-X entries
 * @minvec: minimum number of MSI-X irqs requested
 * @maxvec: maximum number of MSI-X irqs requested
 *
 * Setup the MSI-X capability structure of device function with a maximum
 * possible number of interrupts in the range between @minvec and @maxvec
 * upon its software driver call to request for MSI-X mode enabled on its
 * hardware device function. It returns a negative errno if an error occurs.
 * If it succeeds, it returns the actual number of interrupts allocated and
 * indicates the successful configuration of MSI-X capability structure
 * with new allocated MSI-X interrupts.
 **/
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
			       int minvec, int maxvec)
{
	int rc, nvec = maxvec;

	if (maxvec < minvec)
		return -ERANGE;

	/* Retry with the advertised count until it succeeds or fails hard. */
	for (;;) {
		rc = pci_enable_msix(dev, entries, nvec);
		if (rc == 0)
			return nvec;
		if (rc < 0)
			return rc;
		if (rc < minvec)
			return -ENOSPC;
		nvec = rc;
	}
}
EXPORT_SYMBOL(pci_enable_msix_range);
EXPORT_SYMBOL(pci_enable_msix_range);