/*
 * File:	msi.c
 * Purpose:	PCI Message Signaled Interrupt (MSI)
 *
 * Copyright (C) 2003-2004 Intel
 * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
 */

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/ioport.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/msi.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/slab.h>

#include "pci.h"

static int pci_msi_enable = 1;

#define msix_table_size(flags)	((flags & PCI_MSIX_FLAGS_QSIZE) + 1)


/* Arch hooks */

int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
{
	struct msi_chip *chip = dev->bus->msi;
	int err;

	if (!chip || !chip->setup_irq)
		return -EINVAL;

	err = chip->setup_irq(chip, dev, desc);
	if (err < 0)
		return err;

	irq_set_chip_data(desc->irq, chip);

	return 0;
}

void __weak arch_teardown_msi_irq(unsigned int irq)
{
	struct msi_chip *chip = irq_get_chip_data(irq);

	if (!chip || !chip->teardown_irq)
		return;

	chip->teardown_irq(chip, irq);
}

int __weak arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
{
	struct msi_chip *chip = dev->bus->msi;

	if (!chip || !chip->check_device)
		return 0;

	return chip->check_device(chip, dev, nvec, type);
}

int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
	struct msi_desc *entry;
	int ret;

	/*
	 * If an architecture wants to support multiple MSI, it needs to
	 * override arch_setup_msi_irqs()
	 */
	if (type == PCI_CAP_ID_MSI && nvec > 1)
		return 1;

	list_for_each_entry(entry, &dev->msi_list, list) {
		ret = arch_setup_msi_irq(dev, entry);
		if (ret < 0)
			return ret;
		if (ret > 0)
			return -ENOSPC;
	}

	return 0;
}

/*
 * We have a default implementation available as a separate non-weak
 * function, as it is used by the Xen x86 PCI code
 */
void default_teardown_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int i, nvec;
		if (entry->irq == 0)
			continue;
		if (entry->nvec_used)
			nvec = entry->nvec_used;
		else
			nvec = 1 << entry->msi_attrib.multiple;
		for (i = 0; i < nvec; i++)
			arch_teardown_msi_irq(entry->irq + i);
	}
}

void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
{
	return default_teardown_msi_irqs(dev);
}

static void default_restore_msi_irq(struct pci_dev *dev, int irq)
{
	struct msi_desc *entry;

	entry = NULL;
	if (dev->msix_enabled) {
		list_for_each_entry(entry, &dev->msi_list, list) {
			if (irq == entry->irq)
				break;
		}
	} else if (dev->msi_enabled)  {
		entry = irq_get_msi_desc(irq);
	}

	if (entry)
		write_msi_msg(irq, &entry->msg);
}

void __weak arch_restore_msi_irqs(struct pci_dev *dev)
{
	return default_restore_msi_irqs(dev);
}

static void msi_set_enable(struct pci_dev *dev, int enable)
{
	u16 control;

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
	control &= ~PCI_MSI_FLAGS_ENABLE;
	if (enable)
		control |= PCI_MSI_FLAGS_ENABLE;
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
}

static void msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
{
	u16 ctrl;

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
	ctrl &= ~clear;
	ctrl |= set;
	pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
}

static inline __attribute_const__ u32 msi_mask(unsigned x)
{
	/* Don't shift by >= width of type */
	if (x >= 5)
		return 0xffffffff;
	return (1 << (1 << x)) - 1;
}
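/*
 * Illustrative note (not used by the code): the Multiple Message fields
 * encode a vector count of 2^x, so msi_mask(0) == 0x01 (1 vector),
 * msi_mask(1) == 0x03 (2 vectors), msi_mask(3) == 0xff (8 vectors), and
 * msi_mask(5) and above clamp to 0xffffffff (32 vectors) to avoid an
 * undefined 32-bit shift.
 */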

static inline __attribute_const__ u32 msi_capable_mask(u16 control)
{
	return msi_mask((control >> 1) & 7);
}

/*
 * PCI 2.3 does not specify mask bits for each MSI interrupt.  Attempting to
 * mask all MSI interrupts by clearing the MSI enable bit does not work
 * reliably as devices without an INTx disable bit will then generate a
 * level IRQ which will never be cleared.
 */
u32 default_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	u32 mask_bits = desc->masked;

	if (!desc->msi_attrib.maskbit)
		return 0;

	mask_bits &= ~mask;
	mask_bits |= flag;
	pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits);

	return mask_bits;
}

__weak u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	return default_msi_mask_irq(desc, mask, flag);
}

static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	desc->masked = arch_msi_mask_irq(desc, mask, flag);
}

/*
 * This internal function does not flush PCI writes to the device.
 * All users must ensure that they read from the device before either
 * assuming that the device state is up to date, or returning out of this
 * file.  This saves a few milliseconds when initialising devices with lots
 * of MSI-X interrupts.
 */
u32 default_msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	u32 mask_bits = desc->masked;
	unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
						PCI_MSIX_ENTRY_VECTOR_CTRL;
	mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
	if (flag)
		mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
	writel(mask_bits, desc->mask_base + offset);

	return mask_bits;
}

__weak u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	return default_msix_mask_irq(desc, flag);
}

static void msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	desc->masked = arch_msix_mask_irq(desc, flag);
}

static void msi_set_mask_bit(struct irq_data *data, u32 flag)
{
	struct msi_desc *desc = irq_data_get_msi(data);

	if (desc->msi_attrib.is_msix) {
		msix_mask_irq(desc, flag);
		readl(desc->mask_base);		/* Flush write to device */
	} else {
		unsigned offset = data->irq - desc->dev->irq;
		msi_mask_irq(desc, 1 << offset, flag << offset);
	}
}

void mask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 1);
}

void unmask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 0);
}

void default_restore_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry;

	list_for_each_entry(entry, &dev->msi_list, list) {
		default_restore_msi_irq(dev, entry->irq);
	}
}

void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	BUG_ON(entry->dev->current_state != PCI_D0);

	if (entry->msi_attrib.is_msix) {
		void __iomem *base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
	} else {
		struct pci_dev *dev = entry->dev;
		int pos = dev->msi_cap;
		u16 data;

		pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				      &msg->address_lo);
		if (entry->msi_attrib.is_64) {
			pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					      &msg->address_hi);
			pci_read_config_word(dev, pos + PCI_MSI_DATA_64, &data);
		} else {
			msg->address_hi = 0;
			pci_read_config_word(dev, pos + PCI_MSI_DATA_32, &data);
		}
		msg->data = data;
	}
}

void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__read_msi_msg(entry, msg);
}

void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	/* Assert that the cache is valid, assuming that
	 * valid messages are not all-zeroes. */
	BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
		 entry->msg.data));

	*msg = entry->msg;
}

void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__get_cached_msi_msg(entry, msg);
}

void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	if (entry->dev->current_state != PCI_D0) {
		/* Don't touch the hardware now */
	} else if (entry->msi_attrib.is_msix) {
		void __iomem *base;
		base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
	} else {
		struct pci_dev *dev = entry->dev;
		int pos = dev->msi_cap;
		u16 msgctl;

		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
		msgctl &= ~PCI_MSI_FLAGS_QSIZE;
		msgctl |= entry->msi_attrib.multiple << 4;
		pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);

		pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				       msg->address_lo);
		if (entry->msi_attrib.is_64) {
			pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					       msg->address_hi);
			pci_write_config_word(dev, pos + PCI_MSI_DATA_64,
					      msg->data);
		} else {
			pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
					      msg->data);
		}
	}
	entry->msg = *msg;
}

void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__write_msi_msg(entry, msg);
}

static void free_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry, *tmp;
	struct attribute **msi_attrs;
	struct device_attribute *dev_attr;
	int count = 0;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int i, nvec;
		if (!entry->irq)
			continue;
		if (entry->nvec_used)
			nvec = entry->nvec_used;
		else
			nvec = 1 << entry->msi_attrib.multiple;
		for (i = 0; i < nvec; i++)
			BUG_ON(irq_has_action(entry->irq + i));
	}

	arch_teardown_msi_irqs(dev);

	list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
		if (entry->msi_attrib.is_msix) {
			if (list_is_last(&entry->list, &dev->msi_list))
				iounmap(entry->mask_base);
		}

		/*
		 * It's possible that we get into this path when
		 * populate_msi_sysfs() fails, which means the entries
		 * were not registered with sysfs.  In that case don't
		 * unregister them.
		 */
		if (entry->kobj.parent) {
			kobject_del(&entry->kobj);
			kobject_put(&entry->kobj);
		}

		list_del(&entry->list);
		kfree(entry);
	}

	if (dev->msi_irq_groups) {
		sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups);
		msi_attrs = dev->msi_irq_groups[0]->attrs;
		while (msi_attrs[count]) {
			dev_attr = container_of(msi_attrs[count],
						struct device_attribute, attr);
			kfree(dev_attr->attr.name);
			kfree(dev_attr);
			++count;
		}
		kfree(msi_attrs);
		kfree(dev->msi_irq_groups[0]);
		kfree(dev->msi_irq_groups);
		dev->msi_irq_groups = NULL;
	}
}

static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
{
	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
	if (!desc)
		return NULL;

	INIT_LIST_HEAD(&desc->list);
	desc->dev = dev;

	return desc;
}

static void pci_intx_for_msi(struct pci_dev *dev, int enable)
{
	if (!(dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG))
		pci_intx(dev, enable);
}

static void __pci_restore_msi_state(struct pci_dev *dev)
{
	u16 control;
	struct msi_desc *entry;

	if (!dev->msi_enabled)
		return;

	entry = irq_get_msi_desc(dev->irq);

	pci_intx_for_msi(dev, 0);
	msi_set_enable(dev, 0);
	arch_restore_msi_irqs(dev);

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
	msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
	control &= ~PCI_MSI_FLAGS_QSIZE;
	control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE;
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
}

static void __pci_restore_msix_state(struct pci_dev *dev)
{
	struct msi_desc *entry;

	if (!dev->msix_enabled)
		return;
	BUG_ON(list_empty(&dev->msi_list));
	entry = list_first_entry(&dev->msi_list, struct msi_desc, list);

	/* route the table */
	pci_intx_for_msi(dev, 0);
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

	arch_restore_msi_irqs(dev);
	list_for_each_entry(entry, &dev->msi_list, list) {
		msix_mask_irq(entry, entry->masked);
	}

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
}

void pci_restore_msi_state(struct pci_dev *dev)
{
	__pci_restore_msi_state(dev);
	__pci_restore_msix_state(dev);
}
EXPORT_SYMBOL_GPL(pci_restore_msi_state);

static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct msi_desc *entry;
	unsigned long irq;
	int retval;

	retval = kstrtoul(attr->attr.name, 10, &irq);
	if (retval)
		return retval;

	list_for_each_entry(entry, &pdev->msi_list, list) {
		if (entry->irq == irq) {
			return sprintf(buf, "%s\n",
				       entry->msi_attrib.is_msix ? "msix" : "msi");
		}
	}
	return -ENODEV;
}

static int populate_msi_sysfs(struct pci_dev *pdev)
{
	struct attribute **msi_attrs;
	struct attribute *msi_attr;
	struct device_attribute *msi_dev_attr;
	struct attribute_group *msi_irq_group;
	const struct attribute_group **msi_irq_groups;
	struct msi_desc *entry;
	int ret = -ENOMEM;
	int num_msi = 0;
	int count = 0;

	/* Determine how many msi entries we have */
	list_for_each_entry(entry, &pdev->msi_list, list) {
		++num_msi;
	}
	if (!num_msi)
		return 0;

	/* Dynamically create the MSI attributes for the PCI device */
	msi_attrs = kzalloc(sizeof(void *) * (num_msi + 1), GFP_KERNEL);
	if (!msi_attrs)
		return -ENOMEM;
	list_for_each_entry(entry, &pdev->msi_list, list) {
		msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
		if (!msi_dev_attr)
			goto error_attrs;
		msi_attrs[count] = &msi_dev_attr->attr;

		sysfs_attr_init(&msi_dev_attr->attr);
		msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
						    entry->irq);
		if (!msi_dev_attr->attr.name)
			goto error_attrs;
		msi_dev_attr->attr.mode = S_IRUGO;
		msi_dev_attr->show = msi_mode_show;
		++count;
	}

	msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
	if (!msi_irq_group)
		goto error_attrs;
	msi_irq_group->name = "msi_irqs";
	msi_irq_group->attrs = msi_attrs;

	msi_irq_groups = kzalloc(sizeof(void *) * 2, GFP_KERNEL);
	if (!msi_irq_groups)
		goto error_irq_group;
	msi_irq_groups[0] = msi_irq_group;

	ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups);
	if (ret)
		goto error_irq_groups;
	pdev->msi_irq_groups = msi_irq_groups;

	return 0;

error_irq_groups:
	kfree(msi_irq_groups);
error_irq_group:
	kfree(msi_irq_group);
error_attrs:
	count = 0;
	msi_attr = msi_attrs[count];
	while (msi_attr) {
		msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
		kfree(msi_attr->name);
		kfree(msi_dev_attr);
		++count;
		msi_attr = msi_attrs[count];
	}
	kfree(msi_attrs);
	return ret;
}

/**
 * msi_capability_init - configure device's MSI capability structure
 * @dev: pointer to the pci_dev data structure of MSI device function
 * @nvec: number of interrupts to allocate
 *
 * Setup the MSI capability structure of the device with the requested
 * number of interrupts.  A return value of zero indicates the successful
 * setup of an entry with the new MSI irq.  A negative return value indicates
 * an error, and a positive return value indicates the number of interrupts
 * which could have been allocated.
 */
static int msi_capability_init(struct pci_dev *dev, int nvec)
{
	struct msi_desc *entry;
	int ret;
	u16 control;
	unsigned mask;

	msi_set_enable(dev, 0);	/* Disable MSI during set up */

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
	/* MSI Entry Initialization */
	entry = alloc_msi_entry(dev);
	if (!entry)
		return -ENOMEM;

	entry->msi_attrib.is_msix	= 0;
	entry->msi_attrib.is_64		= !!(control & PCI_MSI_FLAGS_64BIT);
	entry->msi_attrib.entry_nr	= 0;
	entry->msi_attrib.maskbit	= !!(control & PCI_MSI_FLAGS_MASKBIT);
	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
	entry->msi_attrib.pos		= dev->msi_cap;

	if (control & PCI_MSI_FLAGS_64BIT)
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
	else
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_32;
	/* All MSIs are unmasked by default; mask them all */
	if (entry->msi_attrib.maskbit)
		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);
	mask = msi_capable_mask(control);
	msi_mask_irq(entry, mask, mask);

	list_add_tail(&entry->list, &dev->msi_list);

	/* Configure MSI capability structure */
	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
	if (ret) {
		msi_mask_irq(entry, mask, ~mask);
		free_msi_irqs(dev);
		return ret;
	}

	ret = populate_msi_sysfs(dev);
	if (ret) {
		msi_mask_irq(entry, mask, ~mask);
		free_msi_irqs(dev);
		return ret;
	}

	/* Set MSI enabled bits */
	pci_intx_for_msi(dev, 0);
	msi_set_enable(dev, 1);
	dev->msi_enabled = 1;

	dev->irq = entry->irq;
	return 0;
}

static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
{
	resource_size_t phys_addr;
	u32 table_offset;
	u8 bir;

	pci_read_config_dword(dev, dev->msix_cap + PCI_MSIX_TABLE,
			      &table_offset);
	bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
	table_offset &= PCI_MSIX_TABLE_OFFSET;
	phys_addr = pci_resource_start(dev, bir) + table_offset;

	return ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE);
}

static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
			      struct msix_entry *entries, int nvec)
{
	struct msi_desc *entry;
	int i;

	for (i = 0; i < nvec; i++) {
		entry = alloc_msi_entry(dev);
		if (!entry) {
			if (!i)
				iounmap(base);
			else
				free_msi_irqs(dev);
			/* Not enough memory. Don't try again */
			return -ENOMEM;
		}

		entry->msi_attrib.is_msix	= 1;
		entry->msi_attrib.is_64		= 1;
		entry->msi_attrib.entry_nr	= entries[i].entry;
		entry->msi_attrib.default_irq	= dev->irq;
		entry->msi_attrib.pos		= dev->msix_cap;
		entry->mask_base		= base;

		list_add_tail(&entry->list, &dev->msi_list);
	}

	return 0;
}

static void msix_program_entries(struct pci_dev *dev,
				 struct msix_entry *entries)
{
	struct msi_desc *entry;
	int i = 0;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE +
						PCI_MSIX_ENTRY_VECTOR_CTRL;

		entries[i].vector = entry->irq;
		irq_set_msi_desc(entry->irq, entry);
		entry->masked = readl(entry->mask_base + offset);
		msix_mask_irq(entry, 1);
		i++;
	}
}

/**
 * msix_capability_init - configure device's MSI-X capability
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of struct msix_entry entries
 * @nvec: number of @entries
 *
 * Setup the MSI-X capability structure of device function with a
 * single MSI-X irq. A return of zero indicates the successful setup of
 * requested MSI-X entries with allocated irqs or non-zero for otherwise.
 **/
static int msix_capability_init(struct pci_dev *dev,
				struct msix_entry *entries, int nvec)
{
	int ret;
	u16 control;
	void __iomem *base;

	/* Ensure MSI-X is disabled while it is set up */
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
	/* Request & Map MSI-X table region */
	base = msix_map_region(dev, msix_table_size(control));
	if (!base)
		return -ENOMEM;

	ret = msix_setup_entries(dev, base, entries, nvec);
	if (ret)
		return ret;

	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
	if (ret)
		goto out_avail;

	/*
	 * Some devices require MSI-X to be enabled before we can touch the
	 * MSI-X registers.  We need to mask all the vectors to prevent
	 * interrupts coming in before they're fully set up.
	 */
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);

	msix_program_entries(dev, entries);

	ret = populate_msi_sysfs(dev);
	if (ret)
		goto out_free;

	/* Set MSI-X enabled bits and unmask the function */
	pci_intx_for_msi(dev, 0);
	dev->msix_enabled = 1;

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);

	return 0;

out_avail:
	if (ret < 0) {
		/*
		 * If we had some success, report the number of irqs
		 * we succeeded in setting up.
		 */
		struct msi_desc *entry;
		int avail = 0;

		list_for_each_entry(entry, &dev->msi_list, list) {
			if (entry->irq != 0)
				avail++;
		}
		if (avail != 0)
			ret = avail;
	}

out_free:
	free_msi_irqs(dev);

	return ret;
}

/**
 * pci_msi_check_device - check whether MSI may be enabled on a device
 * @dev: pointer to the pci_dev data structure of MSI device function
 * @nvec: how many MSIs have been requested ?
 * @type: are we checking for MSI or MSI-X ?
 *
 * Look at global flags, the device itself, and its parent buses
 * to determine if MSI/-X are supported for the device. If MSI/-X is
 * supported return 0, else return an error code.
 **/
static int pci_msi_check_device(struct pci_dev *dev, int nvec, int type)
{
	struct pci_bus *bus;
	int ret;

	/* MSI must be globally enabled and supported by the device */
	if (!pci_msi_enable || !dev || dev->no_msi)
		return -EINVAL;

	/*
	 * You can't ask to have 0 or less MSIs configured.
	 *  a) it's stupid ..
	 *  b) the list manipulation code assumes nvec >= 1.
	 */
	if (nvec < 1)
		return -ERANGE;

	/*
	 * Any bridge which does NOT route MSI transactions from its
	 * secondary bus to its primary bus must set NO_MSI flag on
	 * the secondary pci_bus.
	 * We expect only arch-specific PCI host bus controller driver
	 * or quirks for specific PCI bridges to be setting NO_MSI.
	 */
	for (bus = dev->bus; bus; bus = bus->parent)
		if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
			return -EINVAL;

	ret = arch_msi_check_device(dev, nvec, type);
	if (ret)
		return ret;

	return 0;
}

/**
 * pci_msi_vec_count - Return the number of MSI vectors a device can send
 * @dev: device to report about
 *
 * This function returns the number of MSI vectors a device requested via
 * Multiple Message Capable register. It returns a negative errno if the
 * device is not capable of sending MSI interrupts. Otherwise, the call succeeds
 * and returns a power of two, up to a maximum of 2^5 (32), according to the
 * MSI specification.
 **/
int pci_msi_vec_count(struct pci_dev *dev)
{
	int ret;
	u16 msgctl;

	if (!dev->msi_cap)
		return -EINVAL;

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &msgctl);
	ret = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);

	return ret;
}
EXPORT_SYMBOL(pci_msi_vec_count);
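/*
 * Hypothetical driver-side usage of pci_msi_vec_count() (illustrative
 * sketch only, not part of this file):
 *
 *	nvec = pci_msi_vec_count(pdev);
 *	if (nvec < 0)
 *		return nvec;			(device has no MSI capability)
 *	nvec = min(nvec, num_online_cpus());	(driver-specific cap)
 */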

void pci_msi_shutdown(struct pci_dev *dev)
{
	struct msi_desc *desc;
	u32 mask;
	u16 ctrl;

	if (!pci_msi_enable || !dev || !dev->msi_enabled)
		return;

	BUG_ON(list_empty(&dev->msi_list));
	desc = list_first_entry(&dev->msi_list, struct msi_desc, list);

	msi_set_enable(dev, 0);
	pci_intx_for_msi(dev, 1);
	dev->msi_enabled = 0;

	/* Return the device with MSI unmasked as initial states */
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl);
	mask = msi_capable_mask(ctrl);
	/* Keep cached state to be restored */
	arch_msi_mask_irq(desc, mask, ~mask);

	/* Restore dev->irq to its default pin-assertion irq */
	dev->irq = desc->msi_attrib.default_irq;
}

void pci_disable_msi(struct pci_dev *dev)
{
	if (!pci_msi_enable || !dev || !dev->msi_enabled)
		return;

	pci_msi_shutdown(dev);
	free_msi_irqs(dev);
}
EXPORT_SYMBOL(pci_disable_msi);

/**
 * pci_msix_vec_count - return the number of device's MSI-X table entries
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * This function returns the number of device's MSI-X table entries and
 * therefore the number of MSI-X vectors device is capable of sending.
 * It returns a negative errno if the device is not capable of sending MSI-X
 * interrupts.
 **/
int pci_msix_vec_count(struct pci_dev *dev)
{
	u16 control;

	if (!dev->msix_cap)
		return -EINVAL;

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
	return msix_table_size(control);
}
EXPORT_SYMBOL(pci_msix_vec_count);

/**
 * pci_enable_msix - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of MSI-X entries
 * @nvec: number of MSI-X irqs requested for allocation by device driver
 *
 * Setup the MSI-X capability structure of device function with the number
 * of requested irqs upon its software driver call to request for
 * MSI-X mode enabled on its hardware device function. A return of zero
 * indicates the successful configuration of MSI-X capability structure
 * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
 * Or a return of > 0 indicates that driver request is exceeding the number
 * of irqs or MSI-X vectors available. Driver should use the returned value to
 * re-send its request.
 **/
int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
{
	int status, nr_entries;
	int i, j;

	if (!entries || !dev->msix_cap || dev->current_state != PCI_D0)
		return -EINVAL;

	status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSIX);
	if (status)
		return status;

	nr_entries = pci_msix_vec_count(dev);
	if (nr_entries < 0)
		return nr_entries;
	if (nvec > nr_entries)
		return nr_entries;

	/* Check for any invalid entries */
	for (i = 0; i < nvec; i++) {
		if (entries[i].entry >= nr_entries)
			return -EINVAL;		/* invalid entry */
		for (j = i + 1; j < nvec; j++) {
			if (entries[i].entry == entries[j].entry)
				return -EINVAL;	/* duplicate entry */
		}
	}
	WARN_ON(!!dev->msix_enabled);

	/* Check whether driver already requested for MSI irq */
	if (dev->msi_enabled) {
		dev_info(&dev->dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
		return -EINVAL;
	}
	status = msix_capability_init(dev, entries, nvec);
	return status;
}
EXPORT_SYMBOL(pci_enable_msix);
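/*
 * The retry pattern described in the kernel-doc above, as a hypothetical
 * driver sketch (not part of this file):
 *
 *	for (i = 0; i < nvec; i++)
 *		entries[i].entry = i;
 *	rc = pci_enable_msix(pdev, entries, nvec);
 *	if (rc > 0)
 *		rc = pci_enable_msix(pdev, entries, rc);  (retry with fewer)
 *	if (rc < 0)
 *		(fall back to pci_enable_msi() or legacy INTx)
 */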

void pci_msix_shutdown(struct pci_dev *dev)
{
	struct msi_desc *entry;

	if (!pci_msi_enable || !dev || !dev->msix_enabled)
		return;

	/* Return the device with MSI-X masked as initial states */
	list_for_each_entry(entry, &dev->msi_list, list) {
		/* Keep cached states to be restored */
		arch_msix_mask_irq(entry, 1);
	}

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
	pci_intx_for_msi(dev, 1);
	dev->msix_enabled = 0;
}

void pci_disable_msix(struct pci_dev *dev)
{
	if (!pci_msi_enable || !dev || !dev->msix_enabled)
		return;

	pci_msix_shutdown(dev);
	free_msi_irqs(dev);
}
EXPORT_SYMBOL(pci_disable_msix);

/**
 * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state
 * @dev: pointer to the pci_dev data structure of MSI(X) device function
 *
 * Being called during hotplug remove, from which the device function
 * is hot-removed. All previous assigned MSI/MSI-X irqs, if
 * allocated for this device function, are reclaimed to unused state,
 * which may be used later on.
 **/
void msi_remove_pci_irq_vectors(struct pci_dev *dev)
{
	if (!pci_msi_enable || !dev)
		return;

	if (dev->msi_enabled || dev->msix_enabled)
		free_msi_irqs(dev);
}

void pci_no_msi(void)
{
	pci_msi_enable = 0;
}

/**
 * pci_msi_enabled - is MSI enabled?
 *
 * Returns true if MSI has not been disabled by the command-line option
 * pci=nomsi.
 **/
int pci_msi_enabled(void)
{
	return pci_msi_enable;
}
EXPORT_SYMBOL(pci_msi_enabled);

void pci_msi_init_pci_dev(struct pci_dev *dev)
{
	INIT_LIST_HEAD(&dev->msi_list);

	/* Disable the msi hardware to avoid screaming interrupts
	 * during boot.  This is the power on reset default so
	 * usually this should be a noop.
	 */
	dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
	if (dev->msi_cap)
		msi_set_enable(dev, 0);

	dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
	if (dev->msix_cap)
		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
}

/**
 * pci_enable_msi_range - configure device's MSI capability structure
 * @dev: device to configure
 * @minvec: minimal number of interrupts to configure
 * @maxvec: maximum number of interrupts to configure
 *
 * This function tries to allocate a maximum possible number of interrupts in a
 * range between @minvec and @maxvec. It returns a negative errno if an error
 * occurs. If it succeeds, it returns the actual number of interrupts allocated
 * and updates the @dev's irq member to the lowest new interrupt number;
 * the other interrupt numbers allocated to this device are consecutive.
 **/
int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
{
	int nvec;
	int rc;

	if (dev->current_state != PCI_D0)
		return -EINVAL;

	WARN_ON(!!dev->msi_enabled);

	/* Check whether driver already requested MSI-X irqs */
	if (dev->msix_enabled) {
		dev_info(&dev->dev,
			 "can't enable MSI (MSI-X already enabled)\n");
		return -EINVAL;
	}

	if (maxvec < minvec)
		return -ERANGE;

	nvec = pci_msi_vec_count(dev);
	if (nvec < 0)
		return nvec;
	else if (nvec < minvec)
		return -EINVAL;
	else if (nvec > maxvec)
		nvec = maxvec;

	do {
		rc = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
		if (rc < 0) {
			return rc;
		} else if (rc > 0) {
			if (rc < minvec)
				return -ENOSPC;
			nvec = rc;
		}
	} while (rc);

	do {
		rc = msi_capability_init(dev, nvec);
		if (rc < 0) {
			return rc;
		} else if (rc > 0) {
			if (rc < minvec)
				return -ENOSPC;
			nvec = rc;
		}
	} while (rc);

	return nvec;
}
EXPORT_SYMBOL(pci_enable_msi_range);
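/*
 * Hypothetical usage sketch for pci_enable_msi_range() (not part of this
 * file): a driver that works with anywhere from 1 to 4 vectors might do
 *
 *	nvec = pci_enable_msi_range(pdev, 1, 4);
 *	if (nvec < 0)
 *		return nvec;
 *
 * On success, nvec holds the number of vectors actually allocated and
 * pdev->irq .. pdev->irq + nvec - 1 are consecutive interrupt numbers.
 */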

/**
 * pci_enable_msix_range - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of MSI-X entries
 * @minvec: minimum number of MSI-X irqs requested
 * @maxvec: maximum number of MSI-X irqs requested
 *
 * Setup the MSI-X capability structure of device function with a maximum
 * possible number of interrupts in the range between @minvec and @maxvec
 * upon its software driver call to request for MSI-X mode enabled on its
 * hardware device function. It returns a negative errno if an error occurs.
 * If it succeeds, it returns the actual number of interrupts allocated and
 * indicates the successful configuration of MSI-X capability structure
 * with new allocated MSI-X interrupts.
 **/
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
			       int minvec, int maxvec)
{
	int nvec = maxvec;
	int rc;

	if (maxvec < minvec)
		return -ERANGE;

	do {
		rc = pci_enable_msix(dev, entries, nvec);
		if (rc < 0) {
			return rc;
		} else if (rc > 0) {
			if (rc < minvec)
				return -ENOSPC;
			nvec = rc;
		}
	} while (rc);

	return nvec;
}
EXPORT_SYMBOL(pci_enable_msix_range);
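/*
 * Hypothetical usage sketch for pci_enable_msix_range() (not part of this
 * file), requesting between 2 and 4 vectors:
 *
 *	struct msix_entry entries[4];
 *
 *	for (i = 0; i < 4; i++)
 *		entries[i].entry = i;
 *	nvec = pci_enable_msix_range(pdev, entries, 2, 4);
 *	if (nvec < 0)
 *		return nvec;	(could not get at least 2 vectors)
 *
 * On success, entries[0..nvec-1].vector hold the allocated Linux irq numbers.
 */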