/*
 * File:	msi.c
 * Purpose:	PCI Message Signaled Interrupt (MSI)
 *
 * Copyright (C) 2003-2004 Intel
 * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
 */

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/ioport.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/msi.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/slab.h>

#include "pci.h"

/* Global MSI switch; cleared by pci_no_msi() for the "pci=nomsi" option. */
static int pci_msi_enable = 1;

/* MSI-X table size is encoded in the QSIZE field of Message Control as N-1. */
#define msix_table_size(flags)	((flags & PCI_MSIX_FLAGS_QSIZE) + 1)


/* Arch hooks */

/*
 * Weak default: delegate setup of a single MSI vector to the per-bus
 * msi_chip supplied by the platform, if one is registered.
 */
int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
{
	struct msi_chip *chip = dev->bus->msi;
	int err;

	if (!chip || !chip->setup_irq)
		return -EINVAL;

	err = chip->setup_irq(chip, dev, desc);
	if (err < 0)
		return err;

	/* Stash the chip so arch_teardown_msi_irq() can retrieve it */
	irq_set_chip_data(desc->irq, chip);

	return 0;
}

/* Weak default: hand teardown of one vector back to the platform msi_chip. */
void __weak arch_teardown_msi_irq(unsigned int irq)
{
	/* Chip pointer was stored by arch_setup_msi_irq() */
	struct msi_chip *chip = irq_get_chip_data(irq);

	if (!chip || !chip->teardown_irq)
		return;

	chip->teardown_irq(chip, irq);
}

/*
 * Weak default: set up every descriptor on dev->msi_list one vector at a
 * time.  Returns 0 on success, a negative errno on error, or a positive
 * value meaning "fewer vectors than requested could be allocated".
 */
int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
	struct msi_desc *entry;
	int ret;

	/*
	 * If an architecture wants to support multiple MSI, it needs to
	 * override arch_setup_msi_irqs()
	 */
	if (type == PCI_CAP_ID_MSI && nvec > 1)
		return 1;

	list_for_each_entry(entry, &dev->msi_list, list) {
		ret = arch_setup_msi_irq(dev, entry);
		if (ret < 0)
			return ret;
		/* Positive return from the hook means "no space" here */
		if (ret > 0)
			return -ENOSPC;
	}

	return 0;
}

/*
 * We have a default implementation available as a separate non-weak
 * function, as it is used by the Xen x86 PCI code
 */
void default_teardown_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int i, nvec;
		if (entry->irq == 0)
			continue;
		/* nvec_used is authoritative when set; else decode multiple */
		if (entry->nvec_used)
			nvec = entry->nvec_used;
		else
			nvec = 1 << entry->msi_attrib.multiple;
		/* Consecutive irqs belong to this descriptor; free them all */
		for (i = 0; i < nvec; i++)
			arch_teardown_msi_irq(entry->irq + i);
	}
}

/* Weak default teardown hook; architectures may override. */
void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
{
	return default_teardown_msi_irqs(dev);
}
107

/* Rewrite one vector's cached MSI message into the hardware after resume. */
static void default_restore_msi_irq(struct pci_dev *dev, int irq)
{
	struct msi_desc *entry;

	entry = NULL;
	if (dev->msix_enabled) {
		/* MSI-X: locate the descriptor that owns this irq */
		list_for_each_entry(entry, &dev->msi_list, list) {
			if (irq == entry->irq)
				break;
		}
	} else if (dev->msi_enabled)  {
		entry = irq_get_msi_desc(irq);
	}

	if (entry)
		__write_msi_msg(entry, &entry->msg);
}
125

/* Weak default restore hook; architectures may override. */
void __weak arch_restore_msi_irqs(struct pci_dev *dev)
{
	return default_restore_msi_irqs(dev);
}
130

/* Set or clear the MSI Enable bit in the MSI capability's control word. */
static void msi_set_enable(struct pci_dev *dev, int enable)
{
	u16 control;

	/* Read-modify-write to preserve the other control bits */
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
	control &= ~PCI_MSI_FLAGS_ENABLE;
	if (enable)
		control |= PCI_MSI_FLAGS_ENABLE;
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
}

/* Read-modify-write the MSI-X control register: clear @clear, then set @set. */
static void msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
{
	u16 ctrl;

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
	ctrl &= ~clear;
	ctrl |= set;
	pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
}

152 153
static inline __attribute_const__ u32 msi_mask(unsigned x)
{
154 155 156 157
	/* Don't shift by >= width of type */
	if (x >= 5)
		return 0xffffffff;
	return (1 << (1 << x)) - 1;
158 159
}

/*
 * PCI 2.3 does not specify mask bits for each MSI interrupt.  Attempting to
 * mask all MSI interrupts by clearing the MSI enable bit does not work
 * reliably as devices without an INTx disable bit will then generate a
 * level IRQ which will never be cleared.
 */
u32 default_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	u32 mask_bits = desc->masked;

	/* Per-vector masking is optional for MSI; bail if unsupported */
	if (!desc->msi_attrib.maskbit)
		return 0;

	mask_bits &= ~mask;
	mask_bits |= flag;
	pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits);

	/* Caller caches this as the new desc->masked */
	return mask_bits;
}

/* Weak arch override point for MSI per-vector masking. */
__weak u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	return default_msi_mask_irq(desc, mask, flag);
}

/* Mask/unmask MSI vectors and cache the resulting mask register value. */
static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
	desc->masked = arch_msi_mask_irq(desc, mask, flag);
}

/*
 * This internal function does not flush PCI writes to the device.
 * All users must ensure that they read from the device before either
 * assuming that the device state is up to date, or returning out of this
 * file.  This saves a few milliseconds when initialising devices with lots
 * of MSI-X interrupts.
 */
u32 default_msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	u32 mask_bits = desc->masked;
	/* Vector Control dword of this entry in the MSI-X table */
	unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
						PCI_MSIX_ENTRY_VECTOR_CTRL;
	mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
	if (flag)
		mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
	writel(mask_bits, desc->mask_base + offset);

	return mask_bits;
}

/* Weak arch override point for MSI-X per-vector masking. */
__weak u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	return default_msix_mask_irq(desc, flag);
}

/* Mask/unmask one MSI-X vector and cache the resulting control value. */
static void msix_mask_irq(struct msi_desc *desc, u32 flag)
{
	desc->masked = arch_msix_mask_irq(desc, flag);
}
/* irq_chip mask/unmask worker shared by MSI and MSI-X vectors. */
static void msi_set_mask_bit(struct irq_data *data, u32 flag)
{
	struct msi_desc *desc = irq_data_get_msi(data);

	if (desc->msi_attrib.is_msix) {
		msix_mask_irq(desc, flag);
		readl(desc->mask_base);		/* Flush write to device */
	} else {
		/* Multi-MSI: mask-bit position matches the irq offset */
		unsigned offset = data->irq - desc->irq;
		msi_mask_irq(desc, 1 << offset, flag << offset);
	}
}

/* irq_chip callback: mask an MSI/MSI-X interrupt. */
void mask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 1);
}

/* irq_chip callback: unmask an MSI/MSI-X interrupt. */
void unmask_msi_irq(struct irq_data *data)
{
	msi_set_mask_bit(data, 0);
}

243 244 245 246 247 248 249 250 251
void default_restore_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry;

	list_for_each_entry(entry, &dev->msi_list, list) {
		default_restore_msi_irq(dev, entry->irq);
	}
}

/* Read the current MSI/MSI-X message directly from the hardware. */
void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	/* Config/MMIO access requires the device to be in D0 */
	BUG_ON(entry->dev->current_state != PCI_D0);

	if (entry->msi_attrib.is_msix) {
		void __iomem *base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
	} else {
		struct pci_dev *dev = entry->dev;
		int pos = dev->msi_cap;
		u16 data;

		pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				      &msg->address_lo);
		/* Data register location depends on 64-bit address support */
		if (entry->msi_attrib.is_64) {
			pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					      &msg->address_hi);
			pci_read_config_word(dev, pos + PCI_MSI_DATA_64, &data);
		} else {
			msg->address_hi = 0;
			pci_read_config_word(dev, pos + PCI_MSI_DATA_32, &data);
		}
		msg->data = data;
	}
}

/* Read the hardware MSI message for @irq via its descriptor. */
void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__read_msi_msg(entry, msg);
}

/* Return the software-cached copy of the descriptor's MSI message. */
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	/* Assert that the cache is valid, assuming that
	 * valid messages are not all-zeroes. */
	BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
		 entry->msg.data));

	*msg = entry->msg;
}
L
Linus Torvalds 已提交
298

/* Return the cached MSI message for @irq via its descriptor. */
void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__get_cached_msi_msg(entry, msg);
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
Y
Yinghai Lu 已提交
306

/* Program the MSI/MSI-X message into the hardware and cache it. */
void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	if (entry->dev->current_state != PCI_D0) {
		/* Don't touch the hardware now */
	} else if (entry->msi_attrib.is_msix) {
		void __iomem *base;
		base = entry->mask_base +
			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;

		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
	} else {
		struct pci_dev *dev = entry->dev;
		int pos = dev->msi_cap;
		u16 msgctl;

		/* Re-assert the multiple-message-enable field */
		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
		msgctl &= ~PCI_MSI_FLAGS_QSIZE;
		msgctl |= entry->msi_attrib.multiple << 4;
		pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);

		pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				       msg->address_lo);
		/* Data register location depends on 64-bit address support */
		if (entry->msi_attrib.is_64) {
			pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					       msg->address_hi);
			pci_write_config_word(dev, pos + PCI_MSI_DATA_64,
					      msg->data);
		} else {
			pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
					      msg->data);
		}
	}
	/* Cache for later restore (e.g. after suspend/resume) */
	entry->msg = *msg;
}
343

Y
Yinghai Lu 已提交
/* Program the MSI message for @irq via its descriptor. */
void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
	struct msi_desc *entry = irq_get_msi_desc(irq);

	__write_msi_msg(entry, msg);
}
EXPORT_SYMBOL_GPL(write_msi_msg);
Y
Yinghai Lu 已提交
351

/* Tear down all MSI/MSI-X irqs of @dev and free descriptors and sysfs. */
static void free_msi_irqs(struct pci_dev *dev)
{
	struct msi_desc *entry, *tmp;
	struct attribute **msi_attrs;
	struct device_attribute *dev_attr;
	int count = 0;

	/* No vector may still have a handler attached */
	list_for_each_entry(entry, &dev->msi_list, list) {
		int i, nvec;
		if (!entry->irq)
			continue;
		if (entry->nvec_used)
			nvec = entry->nvec_used;
		else
			nvec = 1 << entry->msi_attrib.multiple;
		for (i = 0; i < nvec; i++)
			BUG_ON(irq_has_action(entry->irq + i));
	}

	arch_teardown_msi_irqs(dev);

	list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
		if (entry->msi_attrib.is_msix) {
			/* All entries share one mapping; unmap it once */
			if (list_is_last(&entry->list, &dev->msi_list))
				iounmap(entry->mask_base);
		}

		list_del(&entry->list);
		kfree(entry);
	}

	if (dev->msi_irq_groups) {
		sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups);
		msi_attrs = dev->msi_irq_groups[0]->attrs;
		while (msi_attrs[count]) {
			dev_attr = container_of(msi_attrs[count],
						struct device_attribute, attr);
			kfree(dev_attr->attr.name);
			kfree(dev_attr);
			++count;
		}
		kfree(msi_attrs);
		kfree(dev->msi_irq_groups[0]);
		kfree(dev->msi_irq_groups);
		dev->msi_irq_groups = NULL;
	}
}
S
Satoru Takeuchi 已提交
399

/* Allocate a zeroed msi_desc tied to @dev; returns NULL on OOM. */
static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
{
	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
	if (!desc)
		return NULL;

	INIT_LIST_HEAD(&desc->list);
	desc->dev = dev;

	return desc;
}

412 413 414 415 416 417
static void pci_intx_for_msi(struct pci_dev *dev, int enable)
{
	if (!(dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG))
		pci_intx(dev, enable);
}

/* Re-program MSI state (message, mask, control) after power transition. */
static void __pci_restore_msi_state(struct pci_dev *dev)
{
	u16 control;
	struct msi_desc *entry;

	if (!dev->msi_enabled)
		return;

	entry = irq_get_msi_desc(dev->irq);

	/* Keep MSI disabled while the message is rewritten */
	pci_intx_for_msi(dev, 0);
	msi_set_enable(dev, 0);
	arch_restore_msi_irqs(dev);

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
	msi_mask_irq(entry, msi_mask(entry->msi_attrib.multi_cap),
		     entry->masked);
	control &= ~PCI_MSI_FLAGS_QSIZE;
	control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE;
	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
}

/* Re-program MSI-X state (table entries and masks) after power transition. */
static void __pci_restore_msix_state(struct pci_dev *dev)
{
	struct msi_desc *entry;

	if (!dev->msix_enabled)
		return;
	BUG_ON(list_empty(&dev->msi_list));

	/* route the table */
	pci_intx_for_msi(dev, 0);
	/* Enable with MASKALL set so no vector fires while being restored */
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

	arch_restore_msi_irqs(dev);
	list_for_each_entry(entry, &dev->msi_list, list) {
		msix_mask_irq(entry, entry->masked);
	}

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
}
/* Restore both MSI and MSI-X state; used on resume and after reset. */
void pci_restore_msi_state(struct pci_dev *dev)
{
	__pci_restore_msi_state(dev);
	__pci_restore_msix_state(dev);
}
EXPORT_SYMBOL_GPL(pci_restore_msi_state);
467

/* sysfs show: report "msi" or "msix" for the vector named by the attribute. */
static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	struct msi_desc *entry;
	unsigned long irq;
	int retval;

	/* The attribute name is the irq number (see populate_msi_sysfs()) */
	retval = kstrtoul(attr->attr.name, 10, &irq);
	if (retval)
		return retval;

	entry = irq_get_msi_desc(irq);
	if (entry)
		return sprintf(buf, "%s\n",
				entry->msi_attrib.is_msix ? "msix" : "msi");

	return -ENODEV;
}

static int populate_msi_sysfs(struct pci_dev *pdev)
{
489 490 491 492 493
	struct attribute **msi_attrs;
	struct attribute *msi_attr;
	struct device_attribute *msi_dev_attr;
	struct attribute_group *msi_irq_group;
	const struct attribute_group **msi_irq_groups;
494
	struct msi_desc *entry;
495 496
	int ret = -ENOMEM;
	int num_msi = 0;
497 498
	int count = 0;

499 500 501 502 503 504
	/* Determine how many msi entries we have */
	list_for_each_entry(entry, &pdev->msi_list, list) {
		++num_msi;
	}
	if (!num_msi)
		return 0;
505

506 507 508 509
	/* Dynamically create the MSI attributes for the PCI device */
	msi_attrs = kzalloc(sizeof(void *) * (num_msi + 1), GFP_KERNEL);
	if (!msi_attrs)
		return -ENOMEM;
510
	list_for_each_entry(entry, &pdev->msi_list, list) {
511
		msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
512
		if (!msi_dev_attr)
513
			goto error_attrs;
514
		msi_attrs[count] = &msi_dev_attr->attr;
515

516
		sysfs_attr_init(&msi_dev_attr->attr);
517 518 519 520
		msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
						    entry->irq);
		if (!msi_dev_attr->attr.name)
			goto error_attrs;
521 522 523
		msi_dev_attr->attr.mode = S_IRUGO;
		msi_dev_attr->show = msi_mode_show;
		++count;
524 525
	}

526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541
	msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
	if (!msi_irq_group)
		goto error_attrs;
	msi_irq_group->name = "msi_irqs";
	msi_irq_group->attrs = msi_attrs;

	msi_irq_groups = kzalloc(sizeof(void *) * 2, GFP_KERNEL);
	if (!msi_irq_groups)
		goto error_irq_group;
	msi_irq_groups[0] = msi_irq_group;

	ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups);
	if (ret)
		goto error_irq_groups;
	pdev->msi_irq_groups = msi_irq_groups;

542 543
	return 0;

544 545 546 547 548 549 550 551 552 553 554 555 556
error_irq_groups:
	kfree(msi_irq_groups);
error_irq_group:
	kfree(msi_irq_group);
error_attrs:
	count = 0;
	msi_attr = msi_attrs[count];
	while (msi_attr) {
		msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
		kfree(msi_attr->name);
		kfree(msi_dev_attr);
		++count;
		msi_attr = msi_attrs[count];
557
	}
558
	kfree(msi_attrs);
559 560 561
	return ret;
}

/* Allocate and initialize one msi_desc from the device's MSI capability. */
static struct msi_desc *msi_setup_entry(struct pci_dev *dev)
{
	u16 control;
	struct msi_desc *entry;

	/* MSI Entry Initialization */
	entry = alloc_msi_entry(dev);
	if (!entry)
		return NULL;

	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);

	entry->msi_attrib.is_msix	= 0;
	entry->msi_attrib.is_64		= !!(control & PCI_MSI_FLAGS_64BIT);
	entry->msi_attrib.entry_nr	= 0;
	entry->msi_attrib.maskbit	= !!(control & PCI_MSI_FLAGS_MASKBIT);
	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
	entry->msi_attrib.multi_cap	= (control & PCI_MSI_FLAGS_QMASK) >> 1;

	/* Mask register offset depends on 32-bit vs 64-bit address layout */
	if (control & PCI_MSI_FLAGS_64BIT)
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
	else
		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_32;

	/* Save the initial mask status */
	if (entry->msi_attrib.maskbit)
		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);

	return entry;
}

L
Linus Torvalds 已提交
593 594 595
/**
 * msi_capability_init - configure device's MSI capability structure
 * @dev: pointer to the pci_dev data structure of MSI device function
596
 * @nvec: number of interrupts to allocate
L
Linus Torvalds 已提交
597
 *
598 599 600 601 602 603 604
 * Setup the MSI capability structure of the device with the requested
 * number of interrupts.  A return value of zero indicates the successful
 * setup of an entry with the new MSI irq.  A negative return value indicates
 * an error, and a positive return value indicates the number of interrupts
 * which could have been allocated.
 */
static int msi_capability_init(struct pci_dev *dev, int nvec)
L
Linus Torvalds 已提交
605 606
{
	struct msi_desc *entry;
607
	int ret;
608
	unsigned mask;
L
Linus Torvalds 已提交
609

610
	msi_set_enable(dev, 0);	/* Disable MSI during set up */
611

612
	entry = msi_setup_entry(dev);
613 614
	if (!entry)
		return -ENOMEM;
615

616
	/* All MSIs are unmasked by default, Mask them all */
617
	mask = msi_mask(entry->msi_attrib.multi_cap);
618 619
	msi_mask_irq(entry, mask, mask);

620
	list_add_tail(&entry->list, &dev->msi_list);
621

L
Linus Torvalds 已提交
622
	/* Configure MSI capability structure */
623
	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
624
	if (ret) {
625
		msi_mask_irq(entry, mask, ~mask);
626
		free_msi_irqs(dev);
627
		return ret;
628
	}
629

630 631 632 633 634 635 636
	ret = populate_msi_sysfs(dev);
	if (ret) {
		msi_mask_irq(entry, mask, ~mask);
		free_msi_irqs(dev);
		return ret;
	}

L
Linus Torvalds 已提交
637
	/* Set MSI enabled bits	 */
638
	pci_intx_for_msi(dev, 0);
639
	msi_set_enable(dev, 1);
640
	dev->msi_enabled = 1;
L
Linus Torvalds 已提交
641

642
	dev->irq = entry->irq;
L
Linus Torvalds 已提交
643 644 645
	return 0;
}

/* Map the MSI-X vector table BAR region; returns NULL on failure. */
static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
{
	resource_size_t phys_addr;
	u32 table_offset;
	u8 bir;

	/* Table location = BAR indicated by BIR + offset, per the MSI-X cap */
	pci_read_config_dword(dev, dev->msix_cap + PCI_MSIX_TABLE,
			      &table_offset);
	bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
	table_offset &= PCI_MSIX_TABLE_OFFSET;
	phys_addr = pci_resource_start(dev, bir) + table_offset;

	return ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE);
}

/* Allocate one msi_desc per requested MSI-X entry; cleans up on failure. */
static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
			      struct msix_entry *entries, int nvec)
{
	struct msi_desc *entry;
	int i;

	for (i = 0; i < nvec; i++) {
		entry = alloc_msi_entry(dev);
		if (!entry) {
			/*
			 * Before the first entry is queued, base is owned
			 * here; afterwards free_msi_irqs() unmaps it.
			 */
			if (!i)
				iounmap(base);
			else
				free_msi_irqs(dev);
			/* No enough memory. Don't try again */
			return -ENOMEM;
		}

		entry->msi_attrib.is_msix	= 1;
		entry->msi_attrib.is_64		= 1;
		entry->msi_attrib.entry_nr	= entries[i].entry;
		entry->msi_attrib.default_irq	= dev->irq;
		entry->mask_base		= base;

		list_add_tail(&entry->list, &dev->msi_list);
	}

	return 0;
}

/* Bind irqs to descriptors, report vectors to the caller, mask each entry. */
static void msix_program_entries(struct pci_dev *dev,
				 struct msix_entry *entries)
{
	struct msi_desc *entry;
	int i = 0;

	list_for_each_entry(entry, &dev->msi_list, list) {
		int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE +
						PCI_MSIX_ENTRY_VECTOR_CTRL;

		/* Hand the allocated irq number back to the driver */
		entries[i].vector = entry->irq;
		irq_set_msi_desc(entry->irq, entry);
		entry->masked = readl(entry->mask_base + offset);
		msix_mask_irq(entry, 1);
		i++;
	}
}

L
Linus Torvalds 已提交
708 709 710
/**
 * msix_capability_init - configure device's MSI-X capability
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of struct msix_entry entries
 * @nvec: number of @entries
 *
 * Setup the MSI-X capability structure of device function with a
 * single MSI-X irq. A return of zero indicates the successful setup of
 * requested MSI-X entries with allocated irqs or non-zero for otherwise.
 **/
static int msix_capability_init(struct pci_dev *dev,
				struct msix_entry *entries, int nvec)
{
	int ret;
	u16 control;
	void __iomem *base;

	/* Ensure MSI-X is disabled while it is set up */
	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
	/* Request & Map MSI-X table region */
	base = msix_map_region(dev, msix_table_size(control));
	if (!base)
		return -ENOMEM;

	ret = msix_setup_entries(dev, base, entries, nvec);
	if (ret)
		return ret;

	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
	if (ret)
		goto out_avail;

	/*
	 * Some devices require MSI-X to be enabled before we can touch the
	 * MSI-X registers.  We need to mask all the vectors to prevent
	 * interrupts coming in before they're fully set up.
	 */
	msix_clear_and_set_ctrl(dev, 0,
				PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);

	msix_program_entries(dev, entries);

	ret = populate_msi_sysfs(dev);
	if (ret)
		goto out_free;

	/* Set MSI-X enabled bits and unmask the function */
	pci_intx_for_msi(dev, 0);
	dev->msix_enabled = 1;

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);

	return 0;

out_avail:
	if (ret < 0) {
		/*
		 * If we had some success, report the number of irqs
		 * we succeeded in setting up.
		 */
		struct msi_desc *entry;
		int avail = 0;

		list_for_each_entry(entry, &dev->msi_list, list) {
			if (entry->irq != 0)
				avail++;
		}
		if (avail != 0)
			ret = avail;
	}

out_free:
	free_msi_irqs(dev);

	return ret;
}

787
/**
788
 * pci_msi_supported - check whether MSI may be enabled on a device
789
 * @dev: pointer to the pci_dev data structure of MSI device function
790
 * @nvec: how many MSIs have been requested ?
791
 *
792
 * Look at global flags, the device itself, and its parent buses
793
 * to determine if MSI/-X are supported for the device. If MSI/-X is
794
 * supported return 1, else return 0.
795
 **/
796
static int pci_msi_supported(struct pci_dev *dev, int nvec)
797 798 799
{
	struct pci_bus *bus;

800
	/* MSI must be globally enabled and supported by the device */
801
	if (!pci_msi_enable)
802
		return 0;
803 804

	if (!dev || dev->no_msi || dev->current_state != PCI_D0)
805
		return 0;
806

807 808 809 810 811 812
	/*
	 * You can't ask to have 0 or less MSIs configured.
	 *  a) it's stupid ..
	 *  b) the list manipulation code assumes nvec >= 1.
	 */
	if (nvec < 1)
813
		return 0;
814

H
Hidetoshi Seto 已提交
815 816 817
	/*
	 * Any bridge which does NOT route MSI transactions from its
	 * secondary bus to its primary bus must set NO_MSI flag on
818 819 820 821
	 * the secondary pci_bus.
	 * We expect only arch-specific PCI host bus controller driver
	 * or quirks for specific PCI bridges to be setting NO_MSI.
	 */
822 823
	for (bus = dev->bus; bus; bus = bus->parent)
		if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
824
			return 0;
825

826
	return 1;
827 828
}

/**
 * pci_msi_vec_count - Return the number of MSI vectors a device can send
 * @dev: device to report about
 *
 * This function returns the number of MSI vectors a device requested via
 * Multiple Message Capable register. It returns a negative errno if the
 * device is not capable sending MSI interrupts. Otherwise, the call succeeds
 * and returns a power of two, up to a maximum of 2^5 (32), according to the
 * MSI specification.
 **/
int pci_msi_vec_count(struct pci_dev *dev)
{
	int ret;
	u16 msgctl;

	if (!dev->msi_cap)
		return -EINVAL;

	/* QMASK is a log2 encoding; decode to an actual vector count */
	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &msgctl);
	ret = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);

	return ret;
}
EXPORT_SYMBOL(pci_msi_vec_count);

/* Disable MSI on @dev and restore INTx and the default pin irq. */
void pci_msi_shutdown(struct pci_dev *dev)
{
	struct msi_desc *desc;
	u32 mask;

	if (!pci_msi_enable || !dev || !dev->msi_enabled)
		return;

	BUG_ON(list_empty(&dev->msi_list));
	desc = list_first_entry(&dev->msi_list, struct msi_desc, list);

	msi_set_enable(dev, 0);
	pci_intx_for_msi(dev, 1);
	dev->msi_enabled = 0;

	/* Return the device with MSI unmasked as initial states */
	mask = msi_mask(desc->msi_attrib.multi_cap);
	/* Keep cached state to be restored */
	arch_msi_mask_irq(desc, mask, ~mask);

	/* Restore dev->irq to its default pin-assertion irq */
	dev->irq = desc->msi_attrib.default_irq;
}
877

H
Hidetoshi Seto 已提交
/* Public API: disable MSI and release all associated vectors. */
void pci_disable_msi(struct pci_dev *dev)
{
	if (!pci_msi_enable || !dev || !dev->msi_enabled)
		return;

	pci_msi_shutdown(dev);
	free_msi_irqs(dev);
}
EXPORT_SYMBOL(pci_disable_msi);
L
Linus Torvalds 已提交
887

/**
 * pci_msix_vec_count - return the number of device's MSI-X table entries
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * This function returns the number of device's MSI-X table entries and
 * therefore the number of MSI-X vectors device is capable of sending.
 * It returns a negative errno if the device is not capable of sending MSI-X
 * interrupts.
 **/
int pci_msix_vec_count(struct pci_dev *dev)
{
	u16 control;

	if (!dev->msix_cap)
		return -EINVAL;

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
	return msix_table_size(control);
}
EXPORT_SYMBOL(pci_msix_vec_count);
907

L
Linus Torvalds 已提交
/**
 * pci_enable_msix - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of MSI-X entries
 * @nvec: number of MSI-X irqs requested for allocation by device driver
 *
 * Setup the MSI-X capability structure of device function with the number
 * of requested irqs upon its software driver call to request for
 * MSI-X mode enabled on its hardware device function. A return of zero
 * indicates the successful configuration of MSI-X capability structure
 * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
 * Or a return of > 0 indicates that driver request is exceeding the number
 * of irqs or MSI-X vectors available. Driver should use the returned value to
 * re-send its request.
 **/
int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
{
	int nr_entries;
	int i, j;

	if (!pci_msi_supported(dev, nvec))
		return -EINVAL;

	if (!entries)
		return -EINVAL;

	nr_entries = pci_msix_vec_count(dev);
	if (nr_entries < 0)
		return nr_entries;
	/* Over-ask: report capacity so the driver can retry with fewer */
	if (nvec > nr_entries)
		return nr_entries;

	/* Check for any invalid entries */
	for (i = 0; i < nvec; i++) {
		if (entries[i].entry >= nr_entries)
			return -EINVAL;		/* invalid entry */
		for (j = i + 1; j < nvec; j++) {
			if (entries[i].entry == entries[j].entry)
				return -EINVAL;	/* duplicate entry */
		}
	}
	WARN_ON(!!dev->msix_enabled);

	/* Check whether driver already requested for MSI irq */
	if (dev->msi_enabled) {
		dev_info(&dev->dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
		return -EINVAL;
	}
	return msix_capability_init(dev, entries, nvec);
}
EXPORT_SYMBOL(pci_enable_msix);
L
Linus Torvalds 已提交
959

H
Hidetoshi Seto 已提交
/* Disable MSI-X on @dev, masking each vector and restoring INTx. */
void pci_msix_shutdown(struct pci_dev *dev)
{
	struct msi_desc *entry;

	if (!pci_msi_enable || !dev || !dev->msix_enabled)
		return;

	/* Return the device with MSI-X masked as initial states */
	list_for_each_entry(entry, &dev->msi_list, list) {
		/* Keep cached states to be restored */
		arch_msix_mask_irq(entry, 1);
	}

	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
	pci_intx_for_msi(dev, 1);
	dev->msix_enabled = 0;
}
977

H
Hidetoshi Seto 已提交
/* Public API: disable MSI-X and release all associated vectors. */
void pci_disable_msix(struct pci_dev *dev)
{
	if (!pci_msi_enable || !dev || !dev->msix_enabled)
		return;

	pci_msix_shutdown(dev);
	free_msi_irqs(dev);
}
EXPORT_SYMBOL(pci_disable_msix);
L
Linus Torvalds 已提交
987

/* Globally disable MSI; called for the "pci=nomsi" boot option. */
void pci_no_msi(void)
{
	pci_msi_enable = 0;
}
992

/**
 * pci_msi_enabled - is MSI enabled?
 *
 * Returns true if MSI has not been disabled by the command-line option
 * pci=nomsi.
 **/
int pci_msi_enabled(void)
{
	return pci_msi_enable;
}
EXPORT_SYMBOL(pci_msi_enabled);
1004

/* Per-device MSI init: record capability offsets and quiesce both modes. */
void pci_msi_init_pci_dev(struct pci_dev *dev)
{
	INIT_LIST_HEAD(&dev->msi_list);

	/* Disable the msi hardware to avoid screaming interrupts
	 * during boot.  This is the power on reset default so
	 * usually this should be a noop.
	 */
	dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
	if (dev->msi_cap)
		msi_set_enable(dev, 0);

	dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
	if (dev->msix_cap)
		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
}
/**
 * pci_enable_msi_range - configure device's MSI capability structure
 * @dev: device to configure
 * @minvec: minimal number of interrupts to configure
 * @maxvec: maximum number of interrupts to configure
 *
 * This function tries to allocate a maximum possible number of interrupts in a
 * range between @minvec and @maxvec. It returns a negative errno if an error
 * occurs. If it succeeds, it returns the actual number of interrupts allocated
 * and updates the @dev's irq member to the lowest new interrupt number;
 * the other interrupt numbers allocated to this device are consecutive.
 **/
int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
{
	int nvec;
	int rc;

	if (!pci_msi_supported(dev, minvec))
		return -EINVAL;

	WARN_ON(!!dev->msi_enabled);

	/* Check whether driver already requested MSI-X irqs */
	if (dev->msix_enabled) {
		dev_info(&dev->dev,
			 "can't enable MSI (MSI-X already enabled)\n");
		return -EINVAL;
	}

	if (maxvec < minvec)
		return -ERANGE;

	/* Clamp the request to what the hardware can actually deliver */
	nvec = pci_msi_vec_count(dev);
	if (nvec < 0)
		return nvec;
	else if (nvec < minvec)
		return -EINVAL;
	else if (nvec > maxvec)
		nvec = maxvec;

	/* Positive rc means "only rc vectors possible" — retry with fewer */
	do {
		rc = msi_capability_init(dev, nvec);
		if (rc < 0) {
			return rc;
		} else if (rc > 0) {
			if (rc < minvec)
				return -ENOSPC;
			nvec = rc;
		}
	} while (rc);

	return nvec;
}
EXPORT_SYMBOL(pci_enable_msi_range);

/**
 * pci_enable_msix_range - configure device's MSI-X capability structure
 * @dev: pointer to the pci_dev data structure of MSI-X device function
 * @entries: pointer to an array of MSI-X entries
 * @minvec: minimum number of MSI-X irqs requested
 * @maxvec: maximum number of MSI-X irqs requested
 *
 * Setup the MSI-X capability structure of device function with a maximum
 * possible number of interrupts in the range between @minvec and @maxvec
 * upon its software driver call to request for MSI-X mode enabled on its
 * hardware device function. It returns a negative errno if an error occurs.
 * If it succeeds, it returns the actual number of interrupts allocated and
 * indicates the successful configuration of MSI-X capability structure
 * with new allocated MSI-X interrupts.
 **/
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
			       int minvec, int maxvec)
{
	int rc, nvec = maxvec;

	if (maxvec < minvec)
		return -ERANGE;

	for (;;) {
		rc = pci_enable_msix(dev, entries, nvec);
		if (rc == 0)
			return nvec;
		if (rc < 0)
			return rc;
		/* rc > 0: only rc vectors available — retry with fewer */
		if (rc < minvec)
			return -ENOSPC;
		nvec = rc;
	}
}
EXPORT_SYMBOL(pci_enable_msix_range);