/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *         Leo Duran <leo.duran@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/amba/bus.h>
#include <linux/platform_device.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/dma-contiguous.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
#include <linux/iova.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/hw_irq.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>

#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"
#include "irq_remapping.h"

#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

#define LOOP_TIMEOUT	100000

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
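/*
 * IOVA_PFN() just converts a DMA address to a page-frame number, so with
 * 4kb pages DMA_32BIT_PFN works out to 0xfffff, the last frame below 4GB.
 */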

/* Reserved IOVA ranges */
#define MSI_RANGE_START		(0xfee00000)
#define MSI_RANGE_END		(0xfeefffff)
#define HT_RANGE_START		(0xfd00000000ULL)
#define HT_RANGE_END		(0xffffffffffULL)
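/*
 * These windows are put into reserved_iova_ranges at init time and copied
 * into every dma_ops domain (see dma_ops_domain_alloc() below), so the
 * allocator never hands out IOVAs overlapping the MSI doorbell window or
 * the HyperTransport area.
 */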

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * 512GB Pages are not supported due to a hardware bug
 */
#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
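/*
 * i.e. every power-of-two size from 4kb upwards is advertised, with bit 39
 * (the 512GB step) masked out.
 */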

static DEFINE_RWLOCK(amd_iommu_devtable_lock);

/* List of all available dev_data structures */
static LIST_HEAD(dev_data_list);
static DEFINE_SPINLOCK(dev_data_list_lock);

LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);

/*
 * Domain for untranslated devices - only allocated
 * if iommu=pt passed on kernel cmd line.
 */
static const struct iommu_ops amd_iommu_ops;

static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;

static struct dma_map_ops amd_iommu_dma_ops;

/*
 * This struct contains device specific data for the IOMMU
 */
struct iommu_dev_data {
	struct list_head list;		  /* For domain->dev_list */
	struct list_head dev_data_list;	  /* For global dev_data_list */
	struct protection_domain *domain; /* Domain the device is bound to */
	u16 devid;			  /* PCI Device ID */
	u16 alias;			  /* Alias Device ID */
	bool iommu_v2;			  /* Device can make use of IOMMUv2 */
	bool passthrough;		  /* Device is identity mapped */
	struct {
		bool enabled;
		int qdep;
	} ats;				  /* ATS state */
	bool pri_tlp;			  /* PASID TLB required for
					     PPR completions */
	u32 errata;			  /* Bitmap for errata to apply */
};

/*
 * general struct to manage commands sent to an IOMMU
 */
struct iommu_cmd {
	u32 data[4];
};

struct kmem_cache *amd_iommu_irq_cache;

static void update_domain(struct protection_domain *domain);
static int protection_domain_init(struct protection_domain *domain);
static void detach_device(struct device *dev);

/*
 * Data container for a dma_ops specific protection domain
 */
struct dma_ops_domain {
	/* generic protection domain information */
	struct protection_domain domain;

	/* IOVA RB-Tree */
	struct iova_domain iovad;
};

static struct iova_domain reserved_iova_ranges;
static struct lock_class_key reserved_rbtree_key;

/****************************************************************************
 *
 * Helper functions
 *
 ****************************************************************************/

static inline int match_hid_uid(struct device *dev,
				struct acpihid_map_entry *entry)
{
	const char *hid, *uid;

	hid = acpi_device_hid(ACPI_COMPANION(dev));
	uid = acpi_device_uid(ACPI_COMPANION(dev));

	if (!hid || !(*hid))
		return -ENODEV;

	if (!uid || !(*uid))
		return strcmp(hid, entry->hid);

	if (!(*entry->uid))
		return strcmp(hid, entry->hid);

	return (strcmp(hid, entry->hid) || strcmp(uid, entry->uid));
}

static inline u16 get_pci_device_id(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return PCI_DEVID(pdev->bus->number, pdev->devfn);
}

static inline int get_acpihid_device_id(struct device *dev,
					struct acpihid_map_entry **entry)
{
	struct acpihid_map_entry *p;

	list_for_each_entry(p, &acpihid_map, list) {
		if (!match_hid_uid(dev, p)) {
			if (entry)
				*entry = p;
			return p->devid;
		}
	}
	return -EINVAL;
}

static inline int get_device_id(struct device *dev)
{
	int devid;

	if (dev_is_pci(dev))
		devid = get_pci_device_id(dev);
	else
		devid = get_acpihid_device_id(dev, NULL);

	return devid;
}

static struct protection_domain *to_pdomain(struct iommu_domain *dom)
{
	return container_of(dom, struct protection_domain, domain);
}

static struct iommu_dev_data *alloc_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
	if (!dev_data)
		return NULL;

	dev_data->devid = devid;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static struct iommu_dev_data *search_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
		if (dev_data->devid == devid)
			goto out_unlock;
	}

	dev_data = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
{
	*(u16 *)data = alias;
	return 0;
}

static u16 get_alias(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	u16 devid, ivrs_alias, pci_alias;

	/* The callers make sure that get_device_id() does not fail here */
	devid = get_device_id(dev);
	ivrs_alias = amd_iommu_alias_table[devid];
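	/* Record the last DMA alias in the PCI core's alias walk */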
	pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);

	if (ivrs_alias == pci_alias)
		return ivrs_alias;

	/*
	 * DMA alias showdown
	 *
	 * The IVRS is fairly reliable in telling us about aliases, but it
	 * can't know about every screwy device.  If we don't have an IVRS
	 * reported alias, use the PCI reported alias.  In that case we may
	 * still need to initialize the rlookup and dev_table entries if the
	 * alias is to a non-existent device.
	 */
	if (ivrs_alias == devid) {
		if (!amd_iommu_rlookup_table[pci_alias]) {
			amd_iommu_rlookup_table[pci_alias] =
				amd_iommu_rlookup_table[devid];
			memcpy(amd_iommu_dev_table[pci_alias].data,
			       amd_iommu_dev_table[devid].data,
			       sizeof(amd_iommu_dev_table[pci_alias].data));
		}

		return pci_alias;
	}

	pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
		"for device %s[%04x:%04x], kernel reported alias "
		"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
		PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
		PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
		PCI_FUNC(pci_alias));

	/*
	 * If we don't have a PCI DMA alias and the IVRS alias is on the same
	 * bus, then the IVRS table may know about a quirk that we don't.
	 */
	if (pci_alias == devid &&
	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
		pci_add_dma_alias(pdev, ivrs_alias & 0xff);
		pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
			PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
			dev_name(dev));
	}

	return ivrs_alias;
}

static struct iommu_dev_data *find_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;

	dev_data = search_dev_data(devid);

	if (dev_data == NULL)
		dev_data = alloc_dev_data(devid);

	return dev_data;
}

static struct iommu_dev_data *get_dev_data(struct device *dev)
{
	return dev->archdata.iommu;
}

/*
* Find or create an IOMMU group for an acpihid device.
*/
static struct iommu_group *acpihid_device_group(struct device *dev)
{
	struct acpihid_map_entry *p, *entry = NULL;
	int devid;

	devid = get_acpihid_device_id(dev, &entry);
	if (devid < 0)
		return ERR_PTR(devid);

	list_for_each_entry(p, &acpihid_map, list) {
		if ((devid == p->devid) && p->group)
			entry->group = p->group;
	}

	if (!entry->group)
		entry->group = generic_device_group(dev);

	return entry->group;
}

static bool pci_iommuv2_capable(struct pci_dev *pdev)
{
	static const int caps[] = {
		PCI_EXT_CAP_ID_ATS,
		PCI_EXT_CAP_ID_PRI,
		PCI_EXT_CAP_ID_PASID,
	};
	int i, pos;

	for (i = 0; i < 3; ++i) {
		pos = pci_find_ext_capability(pdev, caps[i]);
		if (pos == 0)
			return false;
	}

	return true;
}

static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
{
	struct iommu_dev_data *dev_data;

	dev_data = get_dev_data(&pdev->dev);

	return dev_data->errata & (1 << erratum) ? true : false;
}

/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	int devid;

	if (!dev || !dev->dma_mask)
		return false;

	devid = get_device_id(dev);
	if (devid < 0)
		return false;

	/* Out of our scope? */
	if (devid > amd_iommu_last_bdf)
		return false;

	if (amd_iommu_rlookup_table[devid] == NULL)
		return false;

	return true;
}

static void init_iommu_group(struct device *dev)
{
	struct iommu_group *group;

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return;

	iommu_group_put(group);
}

static int iommu_init_device(struct device *dev)
{
	struct iommu_dev_data *dev_data;
	int devid;

	if (dev->archdata.iommu)
		return 0;

	devid = get_device_id(dev);
	if (devid < 0)
		return devid;

	dev_data = find_dev_data(devid);
	if (!dev_data)
		return -ENOMEM;

	dev_data->alias = get_alias(dev);

	if (dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
		struct amd_iommu *iommu;

		iommu = amd_iommu_rlookup_table[dev_data->devid];
		dev_data->iommu_v2 = iommu->is_iommu_v2;
	}

	dev->archdata.iommu = dev_data;

	iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			  dev);

	return 0;
}

static void iommu_ignore_device(struct device *dev)
{
	u16 alias;
	int devid;

	devid = get_device_id(dev);
	if (devid < 0)
		return;

	alias = get_alias(dev);

	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));

	amd_iommu_rlookup_table[devid] = NULL;
	amd_iommu_rlookup_table[alias] = NULL;
}

static void iommu_uninit_device(struct device *dev)
{
	int devid;
	struct iommu_dev_data *dev_data;

	devid = get_device_id(dev);
	if (devid < 0)
		return;

	dev_data = search_dev_data(devid);
	if (!dev_data)
		return;

	if (dev_data->domain)
		detach_device(dev);

	iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			    dev);

	iommu_group_remove_device(dev);

	/* Remove dma-ops */
	dev->archdata.dma_ops = NULL;

	/*
	 * We keep dev_data around for unplugged devices and reuse it when the
	 * device is re-plugged - not doing so would introduce a ton of races.
	 */
}

/****************************************************************************
 *
 * Interrupt handling functions
 *
 ****************************************************************************/

static void dump_dte_entry(u16 devid)
{
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
			amd_iommu_dev_table[devid].data[i]);
}

static void dump_command(unsigned long phys_addr)
{
	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
}

static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
	int type, devid, domid, flags;
	volatile u32 *event = __evt;
	int count = 0;
	u64 address;

retry:
	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
	domid   = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
	address = (u64)(((u64)event[3]) << 32) | event[2];

	if (type == 0) {
		/* Did we hit the erratum? */
		if (++count == LOOP_TIMEOUT) {
			pr_err("AMD-Vi: No event written to event log\n");
			return;
		}
		udelay(1);
		goto retry;
	}

	printk(KERN_ERR "AMD-Vi: Event logged [");

	switch (type) {
	case EVENT_TYPE_ILL_DEV:
		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		dump_dte_entry(devid);
		break;
	case EVENT_TYPE_IO_FAULT:
		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_DEV_TAB_ERR:
		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	case EVENT_TYPE_PAGE_TAB_ERR:
		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_ILL_CMD:
		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
		dump_command(address);
		break;
	case EVENT_TYPE_CMD_HARD_ERR:
		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
		       "flags=0x%04x]\n", address, flags);
		break;
	case EVENT_TYPE_IOTLB_INV_TO:
		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
		       "address=0x%016llx]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address);
		break;
	case EVENT_TYPE_INV_DEV_REQ:
		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	default:
		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
	}

	memset(__evt, 0, 4 * sizeof(u32));
}

static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);
		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
	}

	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}

static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
{
	struct amd_iommu_fault fault;

	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
		return;
	}

	fault.address   = raw[1];
	fault.pasid     = PPR_PASID(raw[0]);
	fault.device_id = PPR_DEVID(raw[0]);
	fault.tag       = PPR_TAG(raw[0]);
	fault.flags     = PPR_FLAGS(raw[0]);

	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
}

static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
	u32 head, tail;

	if (iommu->ppr_log == NULL)
		return;

	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

	while (head != tail) {
		volatile u64 *raw;
		u64 entry[2];
		int i;

		raw = (u64 *)(iommu->ppr_log + head);

		/*
		 * Hardware bug: Interrupt may arrive before the entry is
		 * written to memory. If this happens we need to wait for the
		 * entry to arrive.
		 */
		for (i = 0; i < LOOP_TIMEOUT; ++i) {
			if (PPR_REQ_TYPE(raw[0]) != 0)
				break;
			udelay(1);
		}

		/* Avoid memcpy function-call overhead */
		entry[0] = raw[0];
		entry[1] = raw[1];

		/*
		 * To detect the hardware bug we need to clear the entry
		 * back to zero.
		 */
		raw[0] = raw[1] = 0UL;

		/* Update head pointer of hardware ring-buffer */
		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);

		/* Handle PPR entry */
		iommu_handle_ppr_entry(iommu, entry);

		/* Refresh ring-buffer information */
		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
	}
}

irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	struct amd_iommu *iommu = (struct amd_iommu *) data;
	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);

	while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
		/* Enable EVT and PPR interrupts again */
		writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
			iommu->mmio_base + MMIO_STATUS_OFFSET);

		if (status & MMIO_STATUS_EVT_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
			iommu_poll_events(iommu);
		}

		if (status & MMIO_STATUS_PPR_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
			iommu_poll_ppr_log(iommu);
		}

		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling interrupt (by writing 1
		 * to clear the bit), the hardware might also try to set
		 * the interrupt bit in the event status register.
		 * In this scenario, the bit will be set, and disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, driver will need to go through the interrupt handler
		 * again and re-clear the bits
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}
	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

static int wait_on_sem(volatile u64 *sem)
{
	int i = 0;

	while (*sem == 0 && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {
		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
		return -EIO;
	}

	return 0;
}

static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd,
			       u32 tail)
{
	u8 *target;

	target = iommu->cmd_buf + tail;
	tail   = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;

	/* Copy command to buffer */
	memcpy(target, cmd, sizeof(*cmd));

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}

static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
{
	WARN_ON(address & 0x7ULL);

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(__pa(address));
	cmd->data[2] = 1;
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  size_t size, u16 domid, int pde)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[1] |= domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
	if (s) /* size bit - we flush more than one 4kb page */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
}

static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, size_t size)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0]  = devid;
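	/* The ATS invalidation queue depth goes into bits 31:24 of data[0] */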
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
	if (s)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
}

static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
				  u64 address, bool size)
{
	memset(cmd, 0, sizeof(*cmd));

	address &= ~(0xfffULL);

	cmd->data[0]  = pasid;
	cmd->data[1]  = domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	if (size)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
}

static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
				  int qdep, u64 address, bool size)
{
	memset(cmd, 0, sizeof(*cmd));

	address &= ~(0xfffULL);

	cmd->data[0]  = devid;
	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
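	/* The PASID is split: bits 15:8 above in data[0], bits 7:0 below in data[1] */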
	cmd->data[0] |= (qdep  & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[1] |= (pasid & 0xff) << 16;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	cmd->data[3]  = upper_32_bits(address);
	if (size)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
}

static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
			       int status, int tag, bool gn)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0]  = devid;
	if (gn) {
		cmd->data[1]  = pasid;
		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	cmd->data[3]  = tag & 0x1ff;
	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;

	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
}

static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_IRT);
}

914 915
/*
 * Writes the command to the IOMMU's command buffer and informs the
916
 * hardware about the new command.
917
 */
918 919 920
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
921
{
922
	u32 left, tail, head, next_tail;
923 924
	unsigned long flags;

925
again:
926 927
	spin_lock_irqsave(&iommu->lock, flags);

928 929
	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
930 931
	next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
	left      = (head - next_tail) % CMD_BUFFER_SIZE;
932

933 934 935 936
	if (left <= 2) {
		struct iommu_cmd sync_cmd;
		volatile u64 sem = 0;
		int ret;
937

938 939
		build_completion_wait(&sync_cmd, (u64)&sem);
		copy_cmd_to_buffer(iommu, &sync_cmd, tail);
940

		spin_unlock_irqrestore(&iommu->lock, flags);

		if ((ret = wait_on_sem(&sem)) != 0)
			return ret;

		goto again;
947 948
	}

949 950 951
	copy_cmd_to_buffer(iommu, cmd, tail);

	/* We need to sync now to make sure all commands are processed */
952
	iommu->need_sync = sync;
953

954
	spin_unlock_irqrestore(&iommu->lock, flags);
955

956
	return 0;
957 958
}

959 960 961 962 963
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}

964 965 966 967
/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU
 */
968
static int iommu_completion_wait(struct amd_iommu *iommu)
969 970
{
	struct iommu_cmd cmd;
971
	volatile u64 sem = 0;
972
	int ret;
973

974
	if (!iommu->need_sync)
975
		return 0;
976

977
	build_completion_wait(&cmd, (u64)&sem);
978

979
	ret = iommu_queue_command_sync(iommu, &cmd, false);
980
	if (ret)
981
		return ret;
982

983
	return wait_on_sem(&sem);
984 985
}

986
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
987
{
988
	struct iommu_cmd cmd;
989

990
	build_inv_dte(&cmd, devid);
991

992 993
	return iommu_queue_command(iommu, &cmd);
}
994

995 996 997
static void iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;
998

999 1000
	for (devid = 0; devid <= 0xffff; ++devid)
		iommu_flush_dte(iommu, devid);
1001

1002 1003
	iommu_completion_wait(iommu);
}
1004

/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
 */
static void iommu_flush_tlb_all(struct amd_iommu *iommu)
{
	u32 dom_id;
1012

	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				      dom_id, 1);
		iommu_queue_command(iommu, &cmd);
	}
1019

1020
	iommu_completion_wait(iommu);
1021 1022
}

1023
static void iommu_flush_all(struct amd_iommu *iommu)
1024
{
1025
	struct iommu_cmd cmd;
1026

1027
	build_inv_all(&cmd);
1028

1029 1030 1031 1032
	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);
}

static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_irt(&cmd, devid);

	iommu_queue_command(iommu, &cmd);
}

static void iommu_flush_irt_all(struct amd_iommu *iommu)
{
	u32 devid;

	for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
		iommu_flush_irt(iommu, devid);

	iommu_completion_wait(iommu);
}

1052 1053
void iommu_flush_all_caches(struct amd_iommu *iommu)
{
1054 1055 1056 1057
	if (iommu_feature(iommu, FEATURE_IA)) {
		iommu_flush_all(iommu);
	} else {
		iommu_flush_dte_all(iommu);
1058
		iommu_flush_irt_all(iommu);
1059
		iommu_flush_tlb_all(iommu);
1060 1061 1062
	}
}

1063
/*
1064
 * Command send function for flushing on-device TLB
1065
 */
1066 1067
static int device_flush_iotlb(struct iommu_dev_data *dev_data,
			      u64 address, size_t size)
1068 1069
{
	struct amd_iommu *iommu;
1070
	struct iommu_cmd cmd;
1071
	int qdep;
1072

1073 1074
	qdep     = dev_data->ats.qdep;
	iommu    = amd_iommu_rlookup_table[dev_data->devid];
1075

1076
	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
1077 1078

	return iommu_queue_command(iommu, &cmd);
1079 1080
}

1081 1082 1083
/*
 * Command send function for invalidating a device table entry
 */
1084
static int device_flush_dte(struct iommu_dev_data *dev_data)
1085
{
1086
	struct amd_iommu *iommu;
1087
	u16 alias;
1088
	int ret;
1089

1090
	iommu = amd_iommu_rlookup_table[dev_data->devid];
1091
	alias = dev_data->alias;
1092

1093
	ret = iommu_flush_dte(iommu, dev_data->devid);
1094 1095
	if (!ret && alias != dev_data->devid)
		ret = iommu_flush_dte(iommu, alias);
1096 1097 1098
	if (ret)
		return ret;

1099
	if (dev_data->ats.enabled)
1100
		ret = device_flush_iotlb(dev_data, 0, ~0UL);
1101 1102

	return ret;
1103 1104
}

1105 1106 1107 1108 1109
/*
 * TLB invalidation function which is called from the mapping functions.
 * It invalidates a single PTE if the range to flush is within a single
 * page. Otherwise it flushes the whole TLB of the IOMMU.
 */
1110 1111
static void __domain_flush_pages(struct protection_domain *domain,
				 u64 address, size_t size, int pde)
1112
{
1113
	struct iommu_dev_data *dev_data;
1114 1115
	struct iommu_cmd cmd;
	int ret = 0, i;
1116

1117
	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
1118

	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;

		/*
		 * Devices of this domain are behind this IOMMU
		 * We need a TLB flush
		 */
1127
		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1128 1129
	}

1130 1131
	list_for_each_entry(dev_data, &domain->dev_list, list) {

1132
		if (!dev_data->ats.enabled)
1133 1134
			continue;

1135
		ret |= device_flush_iotlb(dev_data, address, size);
1136 1137
	}

1138
	WARN_ON(ret);
1139 1140
}

1141 1142
static void domain_flush_pages(struct protection_domain *domain,
			       u64 address, size_t size)
1143
{
1144
	__domain_flush_pages(domain, address, size, 0);
1145
}
1146

1147
/* Flush the whole IO/TLB for a given protection domain */
1148
static void domain_flush_tlb(struct protection_domain *domain)
1149
{
1150
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
1151 1152
}

1153
/* Flush the whole IO/TLB for a given protection domain - including PDE */
1154
static void domain_flush_tlb_pde(struct protection_domain *domain)
1155
{
1156
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
1157 1158
}

1159
static void domain_flush_complete(struct protection_domain *domain)
1160
{
1161
	int i;
1162

1163 1164 1165
	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;
1166

1167 1168 1169 1170 1171
		/*
		 * Devices of this domain are behind this IOMMU
		 * We need to wait for completion of all commands.
		 */
		iommu_completion_wait(amd_iommus[i]);
1172
	}
1173 1174
}

1175

1176
/*
1177
 * This function flushes the DTEs for all devices in domain
1178
 */
1179
static void domain_flush_devices(struct protection_domain *domain)
1180
{
1181
	struct iommu_dev_data *dev_data;
1182

1183
	list_for_each_entry(dev_data, &domain->dev_list, list)
1184
		device_flush_dte(dev_data);
1185 1186
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   gfp_t gfp)
{
	u64 *pte;

	if (domain->mode == PAGE_MODE_6_LEVEL)
		/* address space already 64 bit large */
		return false;

	pte = (void *)get_zeroed_page(gfp);
	if (!pte)
		return false;

	*pte             = PM_LEVEL_PDE(domain->mode,
					virt_to_phys(domain->pt_root));
	domain->pt_root  = pte;
	domain->mode    += 1;
	domain->updated  = true;

	return true;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
1223
		      unsigned long page_size,
1224 1225 1226
		      u64 **pte_page,
		      gfp_t gfp)
{
1227
	int level, end_lvl;
1228
	u64 *pte, *page;
1229 1230

	BUG_ON(!is_power_of_2(page_size));
1231 1232 1233 1234

	while (address > PM_LEVEL_SIZE(domain->mode))
		increase_address_space(domain, gfp);

1235 1236 1237 1238
	level   = domain->mode - 1;
	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);
1239 1240

	while (level > end_lvl) {
1241 1242 1243 1244 1245
		u64 __pte, __npte;

		__pte = *pte;

		if (!IOMMU_PTE_PRESENT(__pte)) {
1246 1247 1248
			page = (u64 *)get_zeroed_page(gfp);
			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, virt_to_phys(page));
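			/*
			 * Another CPU may have installed this level already;
			 * if so, back out below and retry the walk.
			 */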

			if (cmpxchg64(pte, __pte, __npte)) {
				free_page((unsigned long)page);
				continue;
			}
1256 1257
		}

1258 1259 1260 1261
		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(*pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
1279 1280 1281
static u64 *fetch_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long *page_size)
1282 1283 1284 1285
{
	int level;
	u64 *pte;

1286 1287 1288
	if (address > PM_LEVEL_SIZE(domain->mode))
		return NULL;

1289 1290 1291
	level	   =  domain->mode - 1;
	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
1292

1293 1294 1295
	while (level > 0) {

		/* Not Present */
1296 1297 1298
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

1299
		/* Large PTE */
1300 1301 1302
		if (PM_PTE_LEVEL(*pte) == 7 ||
		    PM_PTE_LEVEL(*pte) == 0)
			break;
1303 1304 1305 1306 1307

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

1308 1309
		level -= 1;

1310
		/* Walk to the next level */
		pte	   = IOMMU_PTE_PAGE(*pte);
		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	if (PM_PTE_LEVEL(*pte) == 0x07) {
		unsigned long pte_mask;

		/*
		 * If we have a series of large PTEs, make
		 * sure to return a pointer to the first one.
		 */
		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
1326 1327 1328 1329 1330
	}

	return pte;
}

/*
 * Generic mapping functions. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
1338 1339 1340
static int iommu_map_page(struct protection_domain *dom,
			  unsigned long bus_addr,
			  unsigned long phys_addr,
1341
			  unsigned long page_size,
1342
			  int prot,
1343
			  gfp_t gfp)
1344
{
1345
	u64 __pte, *pte;
1346
	int i, count;
1347

1348 1349 1350
	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
	BUG_ON(!IS_ALIGNED(phys_addr, page_size));

1351
	if (!(prot & IOMMU_PROT_MASK))
1352 1353
		return -EINVAL;

1354
	count = PAGE_SIZE_PTE_COUNT(page_size);
1355
	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
1356

1357 1358 1359
	if (!pte)
		return -ENOMEM;

1360 1361 1362
	for (i = 0; i < count; ++i)
		if (IOMMU_PTE_PRESENT(pte[i]))
			return -EBUSY;
1363

1364
	if (count > 1) {
1365 1366 1367 1368
		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
	} else
		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;

	if (prot & IOMMU_PROT_IR)
		__pte |= IOMMU_PTE_IR;
	if (prot & IOMMU_PROT_IW)
		__pte |= IOMMU_PTE_IW;

1375 1376
	for (i = 0; i < count; ++i)
		pte[i] = __pte;
1377

1378 1379
	update_domain(dom);

1380 1381 1382
	return 0;
}

1383 1384 1385
static unsigned long iommu_unmap_page(struct protection_domain *dom,
				      unsigned long bus_addr,
				      unsigned long page_size)
1386
{
1387 1388
	unsigned long long unmapped;
	unsigned long unmap_size;
1389 1390 1391 1392 1393
	u64 *pte;

	BUG_ON(!is_power_of_2(page_size));

	unmapped = 0;
1394

1395 1396
	while (unmapped < page_size) {

		pte = fetch_pte(dom, bus_addr, &unmap_size);

		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		}

		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

1411
	BUG_ON(unmapped && !is_power_of_2(unmapped));
1412

1413
	return unmapped;
1414 1415
}

1416 1417 1418
/****************************************************************************
 *
 * The next functions belong to the address allocator for the dma_ops
1419
 * interface functions.
1420 1421
 *
 ****************************************************************************/
1422

1423

1424 1425 1426
static unsigned long dma_ops_alloc_iova(struct device *dev,
					struct dma_ops_domain *dma_dom,
					unsigned int pages, u64 dma_mask)
1427
{
1428
	unsigned long pfn = 0;
1429

1430
	pages = __roundup_pow_of_two(pages);
1431

1432 1433 1434
	if (dma_mask > DMA_BIT_MASK(32))
		pfn = alloc_iova_fast(&dma_dom->iovad, pages,
				      IOVA_PFN(DMA_BIT_MASK(32)));
1435

1436 1437
	if (!pfn)
		pfn = alloc_iova_fast(&dma_dom->iovad, pages, IOVA_PFN(dma_mask));
1438

1439
	return (pfn << PAGE_SHIFT);
1440 1441
}

1442 1443 1444
static void dma_ops_free_iova(struct dma_ops_domain *dma_dom,
			      unsigned long address,
			      unsigned int pages)
1445
{
1446 1447
	pages = __roundup_pow_of_two(pages);
	address >>= PAGE_SHIFT;
1448

1449
	free_iova_fast(&dma_dom->iovad, address, pages);
1450 1451
}

/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
 *
 ****************************************************************************/

/*
 * This function adds a protection domain to the global protection domain list
 */
static void add_domain_to_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_add(&domain->list, &amd_iommu_pd_list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

/*
 * This function removes a protection domain from the global
 * protection domain list
 */
static void del_domain_from_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_del(&domain->list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

static u16 domain_id_alloc(void)
{
	unsigned long flags;
	int id;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
	BUG_ON(id == 0);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__set_bit(id, amd_iommu_pd_alloc_bitmap);
	else
		id = 0;
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	return id;
}

static void domain_id_free(int id)
{
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}

#define DEFINE_FREE_PT_FN(LVL, FN)				\
static void free_pt_##LVL (unsigned long __pt)			\
{								\
	unsigned long p;					\
	u64 *pt;						\
	int i;							\
								\
	pt = (u64 *)__pt;					\
								\
	for (i = 0; i < 512; ++i) {				\
1524
		/* PTE present? */				\
1525 1526 1527
		if (!IOMMU_PTE_PRESENT(pt[i]))			\
			continue;				\
								\
1528 1529 1530 1531 1532
		/* Large PTE? */				\
		if (PM_PTE_LEVEL(pt[i]) == 0 ||			\
		    PM_PTE_LEVEL(pt[i]) == 7)			\
			continue;				\
								\
		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);	\
		FN(p);						\
	}							\
	free_page((unsigned long)pt);				\
}

DEFINE_FREE_PT_FN(l2, free_page)
DEFINE_FREE_PT_FN(l3, free_pt_l2)
DEFINE_FREE_PT_FN(l4, free_pt_l3)
DEFINE_FREE_PT_FN(l5, free_pt_l4)
DEFINE_FREE_PT_FN(l6, free_pt_l5)

static void free_pagetable(struct protection_domain *domain)
1546
{
1547
	unsigned long root = (unsigned long)domain->pt_root;
1548

	switch (domain->mode) {
	case PAGE_MODE_NONE:
		break;
	case PAGE_MODE_1_LEVEL:
		free_page(root);
		break;
	case PAGE_MODE_2_LEVEL:
		free_pt_l2(root);
		break;
	case PAGE_MODE_3_LEVEL:
		free_pt_l3(root);
		break;
	case PAGE_MODE_4_LEVEL:
		free_pt_l4(root);
		break;
	case PAGE_MODE_5_LEVEL:
		free_pt_l5(root);
		break;
	case PAGE_MODE_6_LEVEL:
		free_pt_l6(root);
		break;
	default:
		BUG();
1572 1573 1574
	}
}

static void free_gcr3_tbl_level1(u64 *tbl)
{
	u64 *ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (!(tbl[i] & GCR3_VALID))
			continue;

		ptr = __va(tbl[i] & PAGE_MASK);

		free_page((unsigned long)ptr);
	}
}

static void free_gcr3_tbl_level2(u64 *tbl)
{
	u64 *ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (!(tbl[i] & GCR3_VALID))
			continue;

		ptr = __va(tbl[i] & PAGE_MASK);

		free_gcr3_tbl_level1(ptr);
	}
}

1605 1606
static void free_gcr3_table(struct protection_domain *domain)
{
1607 1608 1609 1610
	if (domain->glx == 2)
		free_gcr3_tbl_level2(domain->gcr3_tbl);
	else if (domain->glx == 1)
		free_gcr3_tbl_level1(domain->gcr3_tbl);
1611 1612
	else
		BUG_ON(domain->glx != 0);
1613

1614 1615 1616
	free_page((unsigned long)domain->gcr3_tbl);
}

1617 1618 1619 1620
/*
 * Free a domain, only used if something went wrong in the
 * allocation path and we need to free an already allocated page table
 */
1621 1622 1623 1624 1625
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
	if (!dom)
		return;

1626 1627
	del_domain_from_list(&dom->domain);

1628
	put_iova_domain(&dom->iovad);
1629

1630
	free_pagetable(&dom->domain);
1631 1632 1633 1634

	kfree(dom);
}

1635 1636
/*
 * Allocates a new protection domain usable for the dma_ops functions.
1637
 * It also initializes the page table and the address allocator data
1638 1639
 * structures required for the dma_ops interface
 */
1640
static struct dma_ops_domain *dma_ops_domain_alloc(void)
{
	struct dma_ops_domain *dma_dom;

	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
	if (!dma_dom)
		return NULL;

1648
	if (protection_domain_init(&dma_dom->domain))
1649
		goto free_dma_dom;
1650

1651
	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1652
	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1653
	dma_dom->domain.flags = PD_DMA_OPS_MASK;
1654 1655 1656 1657
	dma_dom->domain.priv = dma_dom;
	if (!dma_dom->domain.pt_root)
		goto free_dma_dom;

1658 1659 1660
	init_iova_domain(&dma_dom->iovad, PAGE_SIZE,
			 IOVA_START_PFN, DMA_32BIT_PFN);

1661 1662 1663
	/* Initialize reserved ranges */
	copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);

1664 1665
	add_domain_to_list(&dma_dom->domain);

	return dma_dom;

free_dma_dom:
	dma_ops_domain_free(dma_dom);

	return NULL;
}

/*
 * little helper function to check whether a given protection domain is a
 * dma_ops domain
 */
static bool dma_ops_domain(struct protection_domain *domain)
{
	return domain->flags & PD_DMA_OPS_MASK;
}

1683
static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1684
{
1685
	u64 pte_root = 0;
1686
	u64 flags = 0;
1687

1688 1689 1690
	if (domain->mode != PAGE_MODE_NONE)
		pte_root = virt_to_phys(domain->pt_root);

1691 1692 1693
	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
		    << DEV_ENTRY_MODE_SHIFT;
	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1694

1695 1696
	flags = amd_iommu_dev_table[devid].data[1];

1697 1698 1699
	if (ats)
		flags |= DTE_FLAG_IOTLB;

	if (domain->flags & PD_IOMMUV2_MASK) {
		u64 gcr3 = __pa(domain->gcr3_tbl);
		u64 glx  = domain->glx;
		u64 tmp;

		pte_root |= DTE_FLAG_GV;
		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;

		/* First mask out possible old values for GCR3 table */
		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
		flags    &= ~tmp;

		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
		flags    &= ~tmp;

		/* Encode GCR3 table into DTE */
		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
		pte_root |= tmp;

		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
		flags    |= tmp;

		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
		flags    |= tmp;
	}

	flags &= ~(0xffffUL);
	flags |= domain->id;

	amd_iommu_dev_table[devid].data[1]  = flags;
	amd_iommu_dev_table[devid].data[0]  = pte_root;
1731 1732 1733 1734 1735
}

static void clear_dte_entry(u16 devid)
{
	/* remove entry from the device table seen by the hardware */
1736 1737
	amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
1738 1739

	amd_iommu_apply_erratum_63(devid);
1740 1741
}

1742 1743
static void do_attach(struct iommu_dev_data *dev_data,
		      struct protection_domain *domain)
1744 1745
{
	struct amd_iommu *iommu;
1746
	u16 alias;
1747
	bool ats;
1748

1749
	iommu = amd_iommu_rlookup_table[dev_data->devid];
1750
	alias = dev_data->alias;
1751
	ats   = dev_data->ats.enabled;

	/* Update data structures */
	dev_data->domain = domain;
	list_add(&dev_data->list, &domain->dev_list);

	/* Do reference counting */
	domain->dev_iommu[iommu->index] += 1;
	domain->dev_cnt                 += 1;

1761 1762 1763
	/* Update device table */
	set_dte_entry(dev_data->devid, domain, ats);
	if (alias != dev_data->devid)
1764
		set_dte_entry(alias, domain, ats);
1765

1766
	device_flush_dte(dev_data);
1767 1768
}

1769
static void do_detach(struct iommu_dev_data *dev_data)
1770 1771
{
	struct amd_iommu *iommu;
1772
	u16 alias;
1773

	/*
	 * First check if the device is still attached. It might already
	 * be detached from its domain because the generic
	 * iommu_detach_group code detached it and we try again here in
	 * our alias handling.
	 */
	if (!dev_data->domain)
		return;

1783
	iommu = amd_iommu_rlookup_table[dev_data->devid];
1784
	alias = dev_data->alias;
1785 1786

	/* decrease reference counters */
	dev_data->domain->dev_iommu[iommu->index] -= 1;
	dev_data->domain->dev_cnt                 -= 1;

	/* Update data structures */
	dev_data->domain = NULL;
	list_del(&dev_data->list);
1793
	clear_dte_entry(dev_data->devid);
1794 1795
	if (alias != dev_data->devid)
		clear_dte_entry(alias);
1796

1797
	/* Flush the DTE entry */
1798
	device_flush_dte(dev_data);
}

/*
 * If a device is not yet associated with a domain, this function
 * associates it with the domain and makes it visible to the hardware
 */
1805
static int __attach_device(struct iommu_dev_data *dev_data,
1806
			   struct protection_domain *domain)
1807
{
1808
	int ret;
1809

	/*
	 * Must be called with IRQs disabled. Warn here to detect early
	 * when it's not.
	 */
	WARN_ON(!irqs_disabled());

1816 1817 1818
	/* lock domain */
	spin_lock(&domain->lock);

1819
	ret = -EBUSY;
1820
	if (dev_data->domain != NULL)
1821
		goto out_unlock;
1822

1823
	/* Attach alias group root */
1824
	do_attach(dev_data, domain);
1825

1826 1827 1828 1829
	ret = 0;

out_unlock:

1830 1831
	/* ready */
	spin_unlock(&domain->lock);
1832

1833
	return ret;
1834
}
1835


static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
	pci_disable_ats(pdev);
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);
}

/* FIXME: Change generic reset-function to do the same */
static int pri_reset_while_enabled(struct pci_dev *pdev)
{
	u16 control;
	int pos;

1850
	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
1851 1852 1853
	if (!pos)
		return -EINVAL;

1854 1855 1856
	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
	control |= PCI_PRI_CTRL_RESET;
	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
1857 1858 1859 1860

	return 0;
}

1861 1862
static int pdev_iommuv2_enable(struct pci_dev *pdev)
{
	bool reset_enable;
	int reqs, ret;

	/* FIXME: Hardcode number of outstanding requests for now */
	reqs = 32;
	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
		reqs = 1;
	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);

	/* Only allow access to user-accessible pages */
	ret = pci_enable_pasid(pdev, 0);
	if (ret)
		goto out_err;

	/* First reset the PRI state of the device */
	ret = pci_reset_pri(pdev);
	if (ret)
		goto out_err;

1882 1883
	/* Enable PRI */
	ret = pci_enable_pri(pdev, reqs);
1884 1885 1886
	if (ret)
		goto out_err;

	if (reset_enable) {
		ret = pri_reset_while_enabled(pdev);
		if (ret)
			goto out_err;
	}

	ret = pci_enable_ats(pdev, PAGE_SHIFT);
	if (ret)
		goto out_err;

	return 0;

out_err:
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);

	return ret;
}

1906
/* FIXME: Move this to PCI code */
1907
#define PCI_PRI_TLP_OFF		(1 << 15)
1908

static bool pci_pri_tlp_required(struct pci_dev *pdev)
1910
{
1911
	u16 status;
1912 1913
	int pos;

1914
	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
1915 1916 1917
	if (!pos)
		return false;

1918
	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
1919

1920
	return (status & PCI_PRI_TLP_OFF) ? true : false;
1921 1922
}

1923
/*
 * If a device is not yet associated with a domain, this function
 * associates it with the domain and makes it visible to the hardware
 */
1927 1928
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
1929
{
1930
	struct pci_dev *pdev;
1931
	struct iommu_dev_data *dev_data;
1932
	unsigned long flags;
1933
	int ret;
1934

1935 1936
	dev_data = get_dev_data(dev);

1937 1938 1939 1940
	if (!dev_is_pci(dev))
		goto skip_ats_check;

	pdev = to_pci_dev(dev);
1941
	if (domain->flags & PD_IOMMUV2_MASK) {
1942
		if (!dev_data->passthrough)
1943 1944
			return -EINVAL;

1945 1946 1947
		if (dev_data->iommu_v2) {
			if (pdev_iommuv2_enable(pdev) != 0)
				return -EINVAL;
1948

1949 1950 1951 1952
			dev_data->ats.enabled = true;
			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
			dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
		}
1953 1954
	} else if (amd_iommu_iotlb_sup &&
		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
1955 1956 1957
		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
	}
1958

1959
skip_ats_check:
1960
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1961
	ret = __attach_device(dev_data, domain);
1962 1963
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

1964 1965 1966 1967 1968
	/*
	 * We might boot into a crash-kernel here. The crashed kernel
	 * left the caches in the IOMMU dirty. So we have to flush
	 * here to evict all dirty stuff.
	 */
1969
	domain_flush_tlb_pde(domain);
1970 1971

	return ret;
1972 1973
}

1974 1975 1976
/*
 * Removes a device from a protection domain (unlocked)
 */
1977
static void __detach_device(struct iommu_dev_data *dev_data)
1978
{
1979
	struct protection_domain *domain;
1980

	/*
	 * Must be called with IRQs disabled. Warn here to detect early
	 * when it's not.
	 */
	WARN_ON(!irqs_disabled());
1986

1987 1988
	if (WARN_ON(!dev_data->domain))
		return;
1989

1990
	domain = dev_data->domain;
1991

1992
	spin_lock(&domain->lock);
1993

1994
	do_detach(dev_data);
1995

1996
	spin_unlock(&domain->lock);
1997 1998 1999 2000 2001
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
2002
static void detach_device(struct device *dev)
2003
{
2004
	struct protection_domain *domain;
2005
	struct iommu_dev_data *dev_data;
2006 2007
	unsigned long flags;

2008
	dev_data = get_dev_data(dev);
2009
	domain   = dev_data->domain;
2010

2011 2012
	/* lock device table */
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2013
	__detach_device(dev_data);
2014
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2015

2016 2017 2018
	if (!dev_is_pci(dev))
		return;

2019
	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
2020 2021
		pdev_iommuv2_disable(to_pci_dev(dev));
	else if (dev_data->ats.enabled)
2022
		pci_disable_ats(to_pci_dev(dev));
2023 2024

	dev_data->ats.enabled = false;
2025
}
2026

static int amd_iommu_add_device(struct device *dev)
{
	struct iommu_dev_data *dev_data;
	struct iommu_domain *domain;
	struct amd_iommu *iommu;
	int ret, devid;

	if (!check_device(dev) || get_dev_data(dev))
		return 0;

	devid = get_device_id(dev);
	if (devid < 0)
		return devid;

	iommu = amd_iommu_rlookup_table[devid];

	ret = iommu_init_device(dev);
	if (ret) {
		if (ret != -ENOTSUPP)
			pr_err("Failed to initialize device %s - trying to proceed anyway\n",
				dev_name(dev));

		iommu_ignore_device(dev);
		dev->archdata.dma_ops = &nommu_dma_ops;
		goto out;
	}
	init_iommu_group(dev);

	dev_data = get_dev_data(dev);

	BUG_ON(!dev_data);

	if (iommu_pass_through || dev_data->iommu_v2)
		iommu_request_dm_for_dev(dev);

	/* Domains are initialized for this device - have a look what we ended up with */
	domain = iommu_get_domain_for_dev(dev);
	if (domain->type == IOMMU_DOMAIN_IDENTITY)
		dev_data->passthrough = true;
	else
		dev->archdata.dma_ops = &amd_iommu_dma_ops;

out:
	iommu_completion_wait(iommu);

	return 0;
}

static void amd_iommu_remove_device(struct device *dev)
{
	struct amd_iommu *iommu;
	int devid;

	if (!check_device(dev))
		return;

	devid = get_device_id(dev);
	if (devid < 0)
		return;

	iommu = amd_iommu_rlookup_table[devid];

	iommu_uninit_device(dev);
	iommu_completion_wait(iommu);
}

static struct iommu_group *amd_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);

	return acpihid_device_group(dev);
}

/*****************************************************************************
 *
 * The next functions belong to the dma_ops mapping/unmapping code.
 *
 *****************************************************************************/

/*
 * In the dma_ops path we only have the struct device. This function
 * finds the corresponding IOMMU, the protection domain and the
 * requestor id for a given device.
 * If the device is not yet associated with a domain this is also done
 * in this function.
 */
static struct protection_domain *get_domain(struct device *dev)
{
	struct protection_domain *domain;
	struct iommu_domain *io_domain;

	if (!check_device(dev))
		return ERR_PTR(-EINVAL);

	io_domain = iommu_get_domain_for_dev(dev);
	if (!io_domain)
		return NULL;

	domain = to_pdomain(io_domain);
	if (!dma_ops_domain(domain))
		return ERR_PTR(-EBUSY);

	return domain;
}

static void update_device_table(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;

	list_for_each_entry(dev_data, &domain->dev_list, list)
		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
}

static void update_domain(struct protection_domain *domain)
{
	if (!domain->updated)
		return;

	update_device_table(domain);

	domain_flush_devices(domain);
	domain_flush_tlb_pde(domain);

	domain->updated = false;
}

/*
 * This function contains common code for mapping of a physically
 * contiguous memory region into DMA address space. It is used by all
 * mapping functions provided with this IOMMU driver.
 * Must be called with the domain lock held.
 */
static dma_addr_t __map_single(struct device *dev,
			       struct dma_ops_domain *dma_dom,
			       phys_addr_t paddr,
			       size_t size,
			       int direction,
			       bool align,
			       u64 dma_mask)
{
	dma_addr_t offset = paddr & ~PAGE_MASK;
	dma_addr_t address, start, ret;
	unsigned int pages;
	unsigned long align_mask = 0;
	int prot = 0;
	int i;

	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
	paddr &= PAGE_MASK;

	if (align)
		align_mask = (1UL << get_order(size)) - 1;

	address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask);
	if (address == DMA_ERROR_CODE)
		goto out;

	if (direction == DMA_TO_DEVICE)
		prot = IOMMU_PROT_IR;
	else if (direction == DMA_FROM_DEVICE)
		prot = IOMMU_PROT_IW;
	else if (direction == DMA_BIDIRECTIONAL)
		prot = IOMMU_PROT_IW | IOMMU_PROT_IR;

	start = address;
	for (i = 0; i < pages; ++i) {
		ret = iommu_map_page(&dma_dom->domain, start, paddr,
				     PAGE_SIZE, prot, GFP_ATOMIC);
		if (ret)
			goto out_unmap;

		paddr += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	address += offset;

	if (unlikely(amd_iommu_np_cache)) {
		domain_flush_pages(&dma_dom->domain, address, size);
		domain_flush_complete(&dma_dom->domain);
	}

out:
	return address;

out_unmap:

	for (--i; i >= 0; --i) {
		start -= PAGE_SIZE;
		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
	}

	domain_flush_tlb(&dma_dom->domain);
	domain_flush_complete(&dma_dom->domain);

	dma_ops_free_iova(dma_dom, address, pages);

	return DMA_ERROR_CODE;
}

/*
 * Does the reverse of the __map_single function. Must be called with
 * the domain lock held too
 */
static void __unmap_single(struct dma_ops_domain *dma_dom,
			   dma_addr_t dma_addr,
			   size_t size,
			   int dir)
{
	dma_addr_t flush_addr;
	dma_addr_t i, start;
	unsigned int pages;

	flush_addr = dma_addr;
	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
	dma_addr &= PAGE_MASK;
	start = dma_addr;

	for (i = 0; i < pages; ++i) {
		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
		start += PAGE_SIZE;
	}

	domain_flush_tlb(&dma_dom->domain);
	domain_flush_complete(&dma_dom->domain);

	dma_ops_free_iova(dma_dom, dma_addr, pages);
}

/*
 * The exported map_single function for dma_ops.
 */
static dma_addr_t map_page(struct device *dev, struct page *page,
			   unsigned long offset, size_t size,
			   enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	phys_addr_t paddr = page_to_phys(page) + offset;
	struct protection_domain *domain;
	u64 dma_mask;

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL)
		return (dma_addr_t)paddr;
	else if (IS_ERR(domain))
		return DMA_ERROR_CODE;

	dma_mask = *dev->dma_mask;

	return __map_single(dev, domain->priv, paddr, size, dir, false,
			    dma_mask);
}

/*
 * The exported unmap_single function for dma_ops.
 */
static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
		       enum dma_data_direction dir, struct dma_attrs *attrs)
{
	struct protection_domain *domain;

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	__unmap_single(domain->priv, dma_addr, size, dir);
}

/*
 * The exported map_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static int map_sg(struct device *dev, struct scatterlist *sglist,
		  int nelems, enum dma_data_direction dir,
		  struct dma_attrs *attrs)
{
	struct protection_domain *domain;
	int i;
	struct scatterlist *s;
	phys_addr_t paddr;
	int mapped_elems = 0;
	u64 dma_mask;

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return 0;

	dma_mask = *dev->dma_mask;

	for_each_sg(sglist, s, nelems, i) {
		paddr = sg_phys(s);

		s->dma_address = __map_single(dev, domain->priv,
					      paddr, s->length, dir, false,
					      dma_mask);

		if (s->dma_address) {
			s->dma_length = s->length;
			mapped_elems++;
		} else
			goto unmap;
	}

	return mapped_elems;

unmap:
	for_each_sg(sglist, s, mapped_elems, i) {
		if (s->dma_address)
			__unmap_single(domain->priv, s->dma_address,
				       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}

	return 0;
}

/*
 * The exported unmap_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static void unmap_sg(struct device *dev, struct scatterlist *sglist,
		     int nelems, enum dma_data_direction dir,
		     struct dma_attrs *attrs)
{
	struct protection_domain *domain;
	struct scatterlist *s;
	int i;

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	for_each_sg(sglist, s, nelems, i) {
		__unmap_single(domain->priv, s->dma_address,
			       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}
}

/*
 * The exported alloc_coherent function for dma_ops.
 */
static void *alloc_coherent(struct device *dev, size_t size,
			    dma_addr_t *dma_addr, gfp_t flag,
			    struct dma_attrs *attrs)
{
	u64 dma_mask = dev->coherent_dma_mask;
	struct protection_domain *domain;
	struct page *page;

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL) {
		page = alloc_pages(flag, get_order(size));
		*dma_addr = page_to_phys(page);
		return page_address(page);
	} else if (IS_ERR(domain))
		return NULL;

	size	  = PAGE_ALIGN(size);
	dma_mask  = dev->coherent_dma_mask;
	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
	flag     |= __GFP_ZERO;

	page = alloc_pages(flag | __GFP_NOWARN, get_order(size));
	if (!page) {
		if (!gfpflags_allow_blocking(flag))
			return NULL;

		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
						 get_order(size));
		if (!page)
			return NULL;
	}

	if (!dma_mask)
		dma_mask = *dev->dma_mask;

	*dma_addr = __map_single(dev, domain->priv, page_to_phys(page),
				 size, DMA_BIDIRECTIONAL, true, dma_mask);

	if (*dma_addr == DMA_ERROR_CODE)
		goto out_free;

	return page_address(page);

out_free:

	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, get_order(size));

	return NULL;
}

/*
 * The exported free_coherent function for dma_ops.
 */
static void free_coherent(struct device *dev, size_t size,
			  void *virt_addr, dma_addr_t dma_addr,
			  struct dma_attrs *attrs)
{
	struct protection_domain *domain;
	struct page *page;

	page = virt_to_page(virt_addr);
	size = PAGE_ALIGN(size);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		goto free_mem;

	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);

free_mem:
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, get_order(size));
}

/*
 * This function is called by the DMA layer to find out if we can handle a
 * particular device. It is part of the dma_ops.
 */
static int amd_iommu_dma_supported(struct device *dev, u64 mask)
{
	return check_device(dev);
}

static struct dma_map_ops amd_iommu_dma_ops = {
	.alloc		= alloc_coherent,
	.free		= free_coherent,
	.map_page	= map_page,
	.unmap_page	= unmap_page,
	.map_sg		= map_sg,
	.unmap_sg	= unmap_sg,
	.dma_supported	= amd_iommu_dma_supported,
};
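
/*
 * Illustrative sketch (not part of this driver): once amd_iommu_dma_ops is
 * installed for a device, its driver's calls into the generic DMA API end
 * up in the callbacks above, e.g. dma_map_single() lands in map_page().
 * The names "buf" and "len" below are hypothetical.
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */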

static int init_reserved_iova_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *val;

	init_iova_domain(&reserved_iova_ranges, PAGE_SIZE,
			 IOVA_START_PFN, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock,
			  &reserved_rbtree_key);

	/* MSI memory range */
	val = reserve_iova(&reserved_iova_ranges,
			   IOVA_PFN(MSI_RANGE_START), IOVA_PFN(MSI_RANGE_END));
	if (!val) {
		pr_err("Reserving MSI range failed\n");
		return -ENOMEM;
	}

	/* HT memory range */
	val = reserve_iova(&reserved_iova_ranges,
			   IOVA_PFN(HT_RANGE_START), IOVA_PFN(HT_RANGE_END));
	if (!val) {
		pr_err("Reserving HT range failed\n");
		return -ENOMEM;
	}

	/*
	 * Memory used for PCI resources
	 * FIXME: Check whether we can reserve the PCI-hole completely
	 */
	for_each_pci_dev(pdev) {
		int i;

		for (i = 0; i < PCI_NUM_RESOURCES; ++i) {
			struct resource *r = &pdev->resource[i];

			if (!(r->flags & IORESOURCE_MEM))
				continue;

			val = reserve_iova(&reserved_iova_ranges,
					   IOVA_PFN(r->start),
					   IOVA_PFN(r->end));
			if (!val) {
				pr_err("Reserve pci-resource range failed\n");
				return -ENOMEM;
			}
		}
	}

	return 0;
}

int __init amd_iommu_init_api(void)
{
	int ret, err = 0;

	ret = iova_cache_get();
	if (ret)
		return ret;

	ret = init_reserved_iova_ranges();
	if (ret)
		return ret;

	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
	if (err)
		return err;
#ifdef CONFIG_ARM_AMBA
	err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
	if (err)
		return err;
#endif
	err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
	if (err)
		return err;

	return 0;
}

int __init amd_iommu_init_dma_ops(void)
{
	swiotlb        = iommu_pass_through ? 1 : 0;
	iommu_detected = 1;

	/*
	 * In case we don't initialize SWIOTLB (actually the common case
	 * when AMD IOMMU is enabled), make sure there are global
	 * dma_ops set as a fall-back for devices not handled by this
	 * driver (for example non-PCI devices).
	 */
	if (!swiotlb)
		dma_ops = &nommu_dma_ops;

	if (amd_iommu_unmap_flush)
		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
	else
		pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");

	return 0;
}

/*****************************************************************************
 *
 * The following functions belong to the exported interface of AMD IOMMU
 *
 * This interface allows access to lower level functions of the IOMMU
 * like protection domain handling and assignment of devices to domains
 * which is not possible with the dma_ops interface.
 *
 *****************************************************************************/

static void cleanup_domain(struct protection_domain *domain)
{
	struct iommu_dev_data *entry;
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);

	while (!list_empty(&domain->dev_list)) {
		entry = list_first_entry(&domain->dev_list,
					 struct iommu_dev_data, list);
		__detach_device(entry);
	}

	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}

static void protection_domain_free(struct protection_domain *domain)
{
	if (!domain)
		return;

	del_domain_from_list(domain);

	if (domain->id)
		domain_id_free(domain->id);

	kfree(domain);
}

static int protection_domain_init(struct protection_domain *domain)
{
	spin_lock_init(&domain->lock);
	mutex_init(&domain->api_lock);
	domain->id = domain_id_alloc();
	if (!domain->id)
		return -ENOMEM;
	INIT_LIST_HEAD(&domain->dev_list);

	return 0;
}

static struct protection_domain *protection_domain_alloc(void)
{
	struct protection_domain *domain;

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return NULL;

	if (protection_domain_init(domain))
		goto out_err;

	add_domain_to_list(domain);

	return domain;

out_err:
	kfree(domain);

	return NULL;
}

static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
{
	struct protection_domain *pdomain;
	struct dma_ops_domain *dma_domain;

	switch (type) {
	case IOMMU_DOMAIN_UNMANAGED:
		pdomain = protection_domain_alloc();
		if (!pdomain)
			return NULL;

		pdomain->mode    = PAGE_MODE_3_LEVEL;
		pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
		if (!pdomain->pt_root) {
			protection_domain_free(pdomain);
			return NULL;
		}

		pdomain->domain.geometry.aperture_start = 0;
		pdomain->domain.geometry.aperture_end   = ~0ULL;
		pdomain->domain.geometry.force_aperture = true;

		break;
	case IOMMU_DOMAIN_DMA:
		dma_domain = dma_ops_domain_alloc();
		if (!dma_domain) {
			pr_err("AMD-Vi: Failed to allocate\n");
			return NULL;
		}
		pdomain = &dma_domain->domain;
		break;
	case IOMMU_DOMAIN_IDENTITY:
		pdomain = protection_domain_alloc();
		if (!pdomain)
			return NULL;

		pdomain->mode = PAGE_MODE_NONE;
		break;
	default:
		return NULL;
	}

	return &pdomain->domain;
}

static void amd_iommu_domain_free(struct iommu_domain *dom)
{
	struct protection_domain *domain;

	if (!dom)
		return;

	domain = to_pdomain(dom);

	if (domain->dev_cnt > 0)
		cleanup_domain(domain);

	BUG_ON(domain->dev_cnt != 0);

	if (domain->mode != PAGE_MODE_NONE)
		free_pagetable(domain);

	if (domain->flags & PD_IOMMUV2_MASK)
		free_gcr3_table(domain);

	protection_domain_free(domain);
}

static void amd_iommu_detach_device(struct iommu_domain *dom,
				    struct device *dev)
{
	struct iommu_dev_data *dev_data = dev->archdata.iommu;
	struct amd_iommu *iommu;
	int devid;

	if (!check_device(dev))
		return;

	devid = get_device_id(dev);
	if (devid < 0)
		return;

	if (dev_data->domain != NULL)
		detach_device(dev);

	iommu = amd_iommu_rlookup_table[devid];
	if (!iommu)
		return;

	iommu_completion_wait(iommu);
}

static int amd_iommu_attach_device(struct iommu_domain *dom,
				   struct device *dev)
{
	struct protection_domain *domain = to_pdomain(dom);
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	int ret;

	if (!check_device(dev))
		return -EINVAL;

	dev_data = dev->archdata.iommu;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	if (!iommu)
		return -EINVAL;

	if (dev_data->domain)
		detach_device(dev);

	ret = attach_device(dev, domain);

	iommu_completion_wait(iommu);

	return ret;
}

static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
			 phys_addr_t paddr, size_t page_size, int iommu_prot)
{
	struct protection_domain *domain = to_pdomain(dom);
	int prot = 0;
	int ret;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		prot |= IOMMU_PROT_IR;
	if (iommu_prot & IOMMU_WRITE)
		prot |= IOMMU_PROT_IW;

	mutex_lock(&domain->api_lock);
	ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL);
	mutex_unlock(&domain->api_lock);

	return ret;
}

static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
			   size_t page_size)
{
	struct protection_domain *domain = to_pdomain(dom);
	size_t unmap_size;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	mutex_lock(&domain->api_lock);
	unmap_size = iommu_unmap_page(domain, iova, page_size);
	mutex_unlock(&domain->api_lock);

	domain_flush_tlb_pde(domain);

	return unmap_size;
}

static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
					  dma_addr_t iova)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	if (domain->mode == PAGE_MODE_NONE)
		return iova;

	pte = fetch_pte(domain, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte	    = *pte & PM_ADDR_MASK;

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool amd_iommu_capable(enum iommu_cap cap)
{
	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
		return true;
	case IOMMU_CAP_INTR_REMAP:
		return (irq_remapping_enabled == 1);
	case IOMMU_CAP_NOEXEC:
		return false;
	}

	return false;
}

static void amd_iommu_get_dm_regions(struct device *dev,
				     struct list_head *head)
{
	struct unity_map_entry *entry;
	int devid;

	devid = get_device_id(dev);
	if (devid < 0)
		return;

	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
		struct iommu_dm_region *region;

		if (devid < entry->devid_start || devid > entry->devid_end)
			continue;

		region = kzalloc(sizeof(*region), GFP_KERNEL);
		if (!region) {
			pr_err("Out of memory allocating dm-regions for %s\n",
				dev_name(dev));
			return;
		}

		region->start = entry->address_start;
		region->length = entry->address_end - entry->address_start;
		if (entry->prot & IOMMU_PROT_IR)
			region->prot |= IOMMU_READ;
		if (entry->prot & IOMMU_PROT_IW)
			region->prot |= IOMMU_WRITE;

		list_add_tail(&region->list, head);
	}
}

static void amd_iommu_put_dm_regions(struct device *dev,
				     struct list_head *head)
{
	struct iommu_dm_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list)
		kfree(entry);
}

static void amd_iommu_apply_dm_region(struct device *dev,
				      struct iommu_domain *domain,
				      struct iommu_dm_region *region)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct dma_ops_domain *dma_dom = pdomain->priv;
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end   = IOVA_PFN(region->start + region->length);

	WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL);
}

static const struct iommu_ops amd_iommu_ops = {
	.capable = amd_iommu_capable,
	.domain_alloc = amd_iommu_domain_alloc,
	.domain_free  = amd_iommu_domain_free,
	.attach_dev = amd_iommu_attach_device,
	.detach_dev = amd_iommu_detach_device,
	.map = amd_iommu_map,
	.unmap = amd_iommu_unmap,
	.map_sg = default_iommu_map_sg,
	.iova_to_phys = amd_iommu_iova_to_phys,
	.add_device = amd_iommu_add_device,
	.remove_device = amd_iommu_remove_device,
	.device_group = amd_iommu_device_group,
	.get_dm_regions = amd_iommu_get_dm_regions,
	.put_dm_regions = amd_iommu_put_dm_regions,
	.apply_dm_region = amd_iommu_apply_dm_region,
	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
};
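
/*
 * Illustrative sketch (not part of this driver): the callbacks above are
 * reached through the generic IOMMU API. The "iova" and "page" names below
 * are hypothetical.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (dom && !iommu_attach_device(dom, &pdev->dev)) {
 *		iommu_map(dom, iova, page_to_phys(page), PAGE_SIZE,
 *			  IOMMU_READ | IOMMU_WRITE);
 *		...
 *		iommu_unmap(dom, iova, PAGE_SIZE);
 *		iommu_detach_device(dom, &pdev->dev);
 *	}
 *	if (dom)
 *		iommu_domain_free(dom);
 */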

/*****************************************************************************
 *
 * The next functions do a basic initialization of IOMMU for pass through
 * mode
 *
 * In passthrough mode the IOMMU is initialized and enabled but not used for
 * DMA-API translation.
 *
 *****************************************************************************/

/* IOMMUv2 specific functions */
int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);

int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
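
/*
 * Illustrative sketch (not part of this driver): a consumer such as the
 * IOMMUv2 layer registers a notifier block to be called for PPR faults.
 * The handler and variable names below are hypothetical; the void pointer
 * is assumed to carry a struct amd_iommu_fault as passed by the PPR log
 * polling code.
 *
 *	static int my_ppr_handler(struct notifier_block *nb,
 *				  unsigned long e, void *data)
 *	{
 *		struct amd_iommu_fault *fault = data;
 *		...
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_ppr_nb = {
 *		.notifier_call = my_ppr_handler,
 *	};
 *
 *	amd_iommu_register_ppr_notifier(&my_ppr_nb);
 */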

void amd_iommu_domain_direct_map(struct iommu_domain *dom)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);

	/* Update data structure */
	domain->mode    = PAGE_MODE_NONE;
	domain->updated = true;

	/* Make changes visible to IOMMUs */
	update_domain(domain);

	/* Page-table is not visible to IOMMU anymore, so free it */
	free_pagetable(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}
EXPORT_SYMBOL(amd_iommu_domain_direct_map);

int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int levels, ret;

	if (pasids <= 0 || pasids > (PASID_MASK + 1))
		return -EINVAL;

	/* Number of GCR3 table levels required */
	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
		levels += 1;

	if (levels > amd_iommu_max_glx_val)
		return -EINVAL;

	spin_lock_irqsave(&domain->lock, flags);

	/*
	 * Save us all sanity checks whether devices already in the
	 * domain support IOMMUv2. Just force that the domain has no
	 * devices attached when it is switched into IOMMUv2 mode.
	 */
	ret = -EBUSY;
	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
		goto out;

	ret = -ENOMEM;
	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
	if (domain->gcr3_tbl == NULL)
		goto out;

	domain->glx      = levels;
	domain->flags   |= PD_IOMMUV2_MASK;
	domain->updated  = true;

	update_domain(domain);

	ret = 0;

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
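
/*
 * Worked example for the level computation above (illustrative): with
 * pasids = 65536, the first loop iteration sees (65536 - 1) & ~0x1ff != 0,
 * so levels becomes 1 and pasids is shifted down to 128; the next check
 * terminates the loop. levels == 1 therefore means a two-level GCR3 table
 * (512 entries per level, covering up to 2^18 PASIDs), which must not
 * exceed the GLX limit reported by the hardware (amd_iommu_max_glx_val).
 */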

static int __flush_pasid(struct protection_domain *domain, int pasid,
			 u64 address, bool size)
{
	struct iommu_dev_data *dev_data;
	struct iommu_cmd cmd;
	int i, ret;

	if (!(domain->flags & PD_IOMMUV2_MASK))
		return -EINVAL;

	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);

	/*
	 * IOMMU TLB needs to be flushed before Device TLB to
	 * prevent device TLB refill from IOMMU TLB
	 */
	for (i = 0; i < amd_iommus_present; ++i) {
		if (domain->dev_iommu[i] == 0)
			continue;

		ret = iommu_queue_command(amd_iommus[i], &cmd);
		if (ret != 0)
			goto out;
	}

	/* Wait until IOMMU TLB flushes are complete */
	domain_flush_complete(domain);

	/* Now flush device TLBs */
	list_for_each_entry(dev_data, &domain->dev_list, list) {
		struct amd_iommu *iommu;
		int qdep;

		/*
		 * There might be non-IOMMUv2 capable devices in an IOMMUv2
		 * domain.
		 */
		if (!dev_data->ats.enabled)
			continue;

		qdep  = dev_data->ats.qdep;
		iommu = amd_iommu_rlookup_table[dev_data->devid];

		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
				      qdep, address, size);

		ret = iommu_queue_command(iommu, &cmd);
		if (ret != 0)
			goto out;
	}

	/* Wait until all device TLBs are flushed */
	domain_flush_complete(domain);

	ret = 0;

out:

	return ret;
}

static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
				  u64 address)
{
	return __flush_pasid(domain, pasid, address, false);
}

int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
			 u64 address)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __amd_iommu_flush_page(domain, pasid, address);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_flush_page);

static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
{
	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
			     true);
}

int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __amd_iommu_flush_tlb(domain, pasid);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_flush_tlb);

static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
{
	int index;
	u64 *pte;

	while (true) {

		index = (pasid >> (9 * level)) & 0x1ff;
		pte   = &root[index];

		if (level == 0)
			break;

		if (!(*pte & GCR3_VALID)) {
			if (!alloc)
				return NULL;

			root = (void *)get_zeroed_page(GFP_ATOMIC);
			if (root == NULL)
				return NULL;

			*pte = __pa(root) | GCR3_VALID;
		}

		root = __va(*pte & PAGE_MASK);

		level -= 1;
	}

	return pte;
}

static int __set_gcr3(struct protection_domain *domain, int pasid,
		      unsigned long cr3)
{
	u64 *pte;

	if (domain->mode != PAGE_MODE_NONE)
		return -EINVAL;

	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
	if (pte == NULL)
		return -ENOMEM;

	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;

	return __amd_iommu_flush_tlb(domain, pasid);
}

static int __clear_gcr3(struct protection_domain *domain, int pasid)
{
	u64 *pte;

	if (domain->mode != PAGE_MODE_NONE)
		return -EINVAL;

	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
	if (pte == NULL)
		return 0;

	*pte = 0;

	return __amd_iommu_flush_tlb(domain, pasid);
}

int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
			      unsigned long cr3)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __set_gcr3(domain, pasid, cr3);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);

int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __clear_gcr3(domain, pasid);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);

int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
			   int status, int tag)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;

	dev_data = get_dev_data(&pdev->dev);
	iommu    = amd_iommu_rlookup_table[dev_data->devid];

	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
			   tag, dev_data->pri_tlp);

	return iommu_queue_command(iommu, &cmd);
}
EXPORT_SYMBOL(amd_iommu_complete_ppr);

struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
{
	struct protection_domain *pdomain;

	pdomain = get_domain(&pdev->dev);
	if (IS_ERR(pdomain))
		return NULL;

	/* Only return IOMMUv2 domains */
	if (!(pdomain->flags & PD_IOMMUV2_MASK))
		return NULL;

	return &pdomain->domain;
}
EXPORT_SYMBOL(amd_iommu_get_v2_domain);

void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
{
	struct iommu_dev_data *dev_data;

	if (!amd_iommu_v2_supported())
		return;

	dev_data = get_dev_data(&pdev->dev);
	dev_data->errata |= (1 << erratum);
}
EXPORT_SYMBOL(amd_iommu_enable_device_erratum);

int amd_iommu_device_info(struct pci_dev *pdev,
                          struct amd_iommu_device_info *info)
{
	int max_pasids;
	int pos;

	if (pdev == NULL || info == NULL)
		return -EINVAL;

	if (!amd_iommu_v2_supported())
		return -EINVAL;

	memset(info, 0, sizeof(*info));

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
	if (pos)
		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (pos)
		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
	if (pos) {
		int features;

		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
		max_pasids = min(max_pasids, (1 << 20));

		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);

		features = pci_pasid_features(pdev);
		if (features & PCI_PASID_CAP_EXEC)
			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
		if (features & PCI_PASID_CAP_PRIV)
			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
	}

	return 0;
}
EXPORT_SYMBOL(amd_iommu_device_info);
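
/*
 * Illustrative sketch (not part of this driver): a device driver that wants
 * to use PASIDs would typically query the flags and limits reported above
 * before enabling anything. The variable names below are hypothetical.
 *
 *	struct amd_iommu_device_info info;
 *	int pasid_limit = 0;
 *
 *	if (!amd_iommu_device_info(pdev, &info) &&
 *	    (info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP))
 *		pasid_limit = info.max_pasids;
 */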

#ifdef CONFIG_IRQ_REMAP

/*****************************************************************************
 *
 * Interrupt Remapping Implementation
 *
 *****************************************************************************/

union irte {
	u32 val;
	struct {
		u32 valid	: 1,
		    no_fault	: 1,
		    int_type	: 3,
		    rq_eoi	: 1,
		    dm		: 1,
		    rsvd_1	: 1,
		    destination	: 8,
		    vector	: 8,
		    rsvd_2	: 8;
	} fields;
};

struct irq_2_irte {
	u16 devid; /* Device ID for IRTE table */
	u16 index; /* Index into IRTE table*/
};

struct amd_ir_data {
	struct irq_2_irte			irq_2_irte;
	union irte				irte_entry;
	union {
		struct msi_msg			msi_entry;
	};
};

static struct irq_chip amd_ir_chip;

#define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
#define DTE_IRQ_REMAP_INTCTL    (2ULL << 60)
#define DTE_IRQ_TABLE_LEN       (8ULL << 1)
#define DTE_IRQ_REMAP_ENABLE    1ULL

static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
{
	u64 dte;

	dte	= amd_iommu_dev_table[devid].data[2];
	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
	dte	|= virt_to_phys(table->table);
	dte	|= DTE_IRQ_REMAP_INTCTL;
	dte	|= DTE_IRQ_TABLE_LEN;
	dte	|= DTE_IRQ_REMAP_ENABLE;

	amd_iommu_dev_table[devid].data[2] = dte;
}

#define IRTE_ALLOCATED (~1U)

static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
{
	struct irq_remap_table *table = NULL;
	struct amd_iommu *iommu;
	unsigned long flags;
	u16 alias;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);

	iommu = amd_iommu_rlookup_table[devid];
	if (!iommu)
		goto out_unlock;

	table = irq_lookup_table[devid];
	if (table)
		goto out;

	alias = amd_iommu_alias_table[devid];
	table = irq_lookup_table[alias];
	if (table) {
		irq_lookup_table[devid] = table;
		set_dte_irq_entry(devid, table);
		iommu_flush_dte(iommu, devid);
		goto out;
	}

	/* Nothing there yet, allocate new irq remapping table */
	table = kzalloc(sizeof(*table), GFP_ATOMIC);
	if (!table)
		goto out;

	/* Initialize table spin-lock */
	spin_lock_init(&table->lock);

	if (ioapic)
		/* Keep the first 32 indexes free for IOAPIC interrupts */
		table->min_index = 32;

	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
	if (!table->table) {
		kfree(table);
		table = NULL;
		goto out;
	}

	memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));

	if (ioapic) {
		int i;

		for (i = 0; i < 32; ++i)
			table->table[i] = IRTE_ALLOCATED;
	}

	irq_lookup_table[devid] = table;
	set_dte_irq_entry(devid, table);
	iommu_flush_dte(iommu, devid);
	if (devid != alias) {
		irq_lookup_table[alias] = table;
		set_dte_irq_entry(alias, table);
		iommu_flush_dte(iommu, alias);
	}

out:
	iommu_completion_wait(iommu);

out_unlock:
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	return table;
}

static int alloc_irq_index(u16 devid, int count)
{
	struct irq_remap_table *table;
	unsigned long flags;
	int index, c;

	table = get_irq_table(devid, false);
	if (!table)
		return -ENODEV;

	spin_lock_irqsave(&table->lock, flags);

	/* Scan table for free entries */
	for (c = 0, index = table->min_index;
	     index < MAX_IRQS_PER_TABLE;
	     ++index) {
		if (table->table[index] == 0)
			c += 1;
		else
			c = 0;

		if (c == count)	{
			for (; c != 0; --c)
				table->table[index - c + 1] = IRTE_ALLOCATED;

			index -= count - 1;
			goto out;
		}
	}

	index = -ENOSPC;

out:
	spin_unlock_irqrestore(&table->lock, flags);

	return index;
}

static int modify_irte(u16 devid, int index, union irte irte)
{
	struct irq_remap_table *table;
	struct amd_iommu *iommu;
	unsigned long flags;

	iommu = amd_iommu_rlookup_table[devid];
	if (iommu == NULL)
		return -EINVAL;

	table = get_irq_table(devid, false);
	if (!table)
		return -ENOMEM;

	spin_lock_irqsave(&table->lock, flags);
	table->table[index] = irte.val;
	spin_unlock_irqrestore(&table->lock, flags);

	iommu_flush_irt(iommu, devid);
	iommu_completion_wait(iommu);

	return 0;
}

static void free_irte(u16 devid, int index)
{
	struct irq_remap_table *table;
	struct amd_iommu *iommu;
	unsigned long flags;

	iommu = amd_iommu_rlookup_table[devid];
	if (iommu == NULL)
		return;

	table = get_irq_table(devid, false);
	if (!table)
		return;

	spin_lock_irqsave(&table->lock, flags);
	table->table[index] = 0;
	spin_unlock_irqrestore(&table->lock, flags);

	iommu_flush_irt(iommu, devid);
	iommu_completion_wait(iommu);
}

static int get_devid(struct irq_alloc_info *info)
{
	int devid = -1;

	switch (info->type) {
	case X86_IRQ_ALLOC_TYPE_IOAPIC:
		devid     = get_ioapic_devid(info->ioapic_id);
		break;
	case X86_IRQ_ALLOC_TYPE_HPET:
		devid     = get_hpet_devid(info->hpet_id);
		break;
	case X86_IRQ_ALLOC_TYPE_MSI:
	case X86_IRQ_ALLOC_TYPE_MSIX:
		devid = get_device_id(&info->msi_dev->dev);
		break;
	default:
		BUG_ON(1);
		break;
	}

	return devid;
}

static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info)
{
	struct amd_iommu *iommu;
	int devid;

	if (!info)
		return NULL;

	devid = get_devid(info);
	if (devid >= 0) {
		iommu = amd_iommu_rlookup_table[devid];
		if (iommu)
			return iommu->ir_domain;
	}

	return NULL;
}

static struct irq_domain *get_irq_domain(struct irq_alloc_info *info)
{
	struct amd_iommu *iommu;
	int devid;

	if (!info)
		return NULL;

	switch (info->type) {
	case X86_IRQ_ALLOC_TYPE_MSI:
	case X86_IRQ_ALLOC_TYPE_MSIX:
		devid = get_device_id(&info->msi_dev->dev);
		if (devid < 0)
			return NULL;

		iommu = amd_iommu_rlookup_table[devid];
		if (iommu)
			return iommu->msi_domain;
		break;
	default:
		break;
	}

	return NULL;
}

struct irq_remap_ops amd_iommu_irq_ops = {
	.prepare		= amd_iommu_prepare,
	.enable			= amd_iommu_enable,
	.disable		= amd_iommu_disable,
	.reenable		= amd_iommu_reenable,
	.enable_faulting	= amd_iommu_enable_faulting,
	.get_ir_irq_domain	= get_ir_irq_domain,
	.get_irq_domain		= get_irq_domain,
};

static void irq_remapping_prepare_irte(struct amd_ir_data *data,
				       struct irq_cfg *irq_cfg,
				       struct irq_alloc_info *info,
				       int devid, int index, int sub_handle)
{
	struct irq_2_irte *irte_info = &data->irq_2_irte;
	struct msi_msg *msg = &data->msi_entry;
	union irte *irte = &data->irte_entry;
	struct IO_APIC_route_entry *entry;

	data->irq_2_irte.devid = devid;
	data->irq_2_irte.index = index + sub_handle;

	/* Setup IRTE for IOMMU */
	irte->val = 0;
	irte->fields.vector      = irq_cfg->vector;
	irte->fields.int_type    = apic->irq_delivery_mode;
	irte->fields.destination = irq_cfg->dest_apicid;
	irte->fields.dm          = apic->irq_dest_mode;
	irte->fields.valid       = 1;

	switch (info->type) {
	case X86_IRQ_ALLOC_TYPE_IOAPIC:
		/* Setup IOAPIC entry */
		entry = info->ioapic_entry;
		info->ioapic_entry = NULL;
		memset(entry, 0, sizeof(*entry));
		entry->vector        = index;
		entry->mask          = 0;
		entry->trigger       = info->ioapic_trigger;
		entry->polarity      = info->ioapic_polarity;
		/* Mask level triggered irqs. */
		if (info->ioapic_trigger)
			entry->mask = 1;
		break;

	case X86_IRQ_ALLOC_TYPE_HPET:
	case X86_IRQ_ALLOC_TYPE_MSI:
	case X86_IRQ_ALLOC_TYPE_MSIX:
		msg->address_hi = MSI_ADDR_BASE_HI;
		msg->address_lo = MSI_ADDR_BASE_LO;
		msg->data = irte_info->index;
		break;

	default:
		BUG_ON(1);
		break;
	}
}

static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
			       unsigned int nr_irqs, void *arg)
{
	struct irq_alloc_info *info = arg;
	struct irq_data *irq_data;
	struct amd_ir_data *data;
	struct irq_cfg *cfg;
	int i, ret, devid;
	int index = -1;

	if (!info)
		return -EINVAL;
	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
		return -EINVAL;

	/*
	 * With IRQ remapping enabled, don't need contiguous CPU vectors
	 * to support multiple MSI interrupts.
	 */
	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;

	devid = get_devid(info);
	if (devid < 0)
		return -EINVAL;

	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
		if (get_irq_table(devid, true))
			index = info->ioapic_pin;
		else
			ret = -ENOMEM;
	} else {
		index = alloc_irq_index(devid, nr_irqs);
	}
	if (index < 0) {
		pr_warn("Failed to allocate IRTE\n");
		goto out_free_parent;
	}

	for (i = 0; i < nr_irqs; i++) {
		irq_data = irq_domain_get_irq_data(domain, virq + i);
		cfg = irqd_cfg(irq_data);
		if (!irq_data || !cfg) {
			ret = -EINVAL;
			goto out_free_data;
		}

		ret = -ENOMEM;
		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (!data)
			goto out_free_data;

		irq_data->hwirq = (devid << 16) + i;
		irq_data->chip_data = data;
		irq_data->chip = &amd_ir_chip;
		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
	}

	return 0;

out_free_data:
	for (i--; i >= 0; i--) {
		irq_data = irq_domain_get_irq_data(domain, virq + i);
		if (irq_data)
			kfree(irq_data->chip_data);
	}
	for (i = 0; i < nr_irqs; i++)
		free_irte(devid, index + i);
out_free_parent:
	irq_domain_free_irqs_common(domain, virq, nr_irqs);
	return ret;
}

static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
			       unsigned int nr_irqs)
{
	struct irq_2_irte *irte_info;
	struct irq_data *irq_data;
	struct amd_ir_data *data;
	int i;

	for (i = 0; i < nr_irqs; i++) {
		irq_data = irq_domain_get_irq_data(domain, virq + i);
		if (irq_data && irq_data->chip_data) {
			data = irq_data->chip_data;
			irte_info = &data->irq_2_irte;
			free_irte(irte_info->devid, irte_info->index);
			kfree(data);
		}
	}
	irq_domain_free_irqs_common(domain, virq, nr_irqs);
}

static void irq_remapping_activate(struct irq_domain *domain,
				   struct irq_data *irq_data)
{
	struct amd_ir_data *data = irq_data->chip_data;
	struct irq_2_irte *irte_info = &data->irq_2_irte;

	modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
}

static void irq_remapping_deactivate(struct irq_domain *domain,
				     struct irq_data *irq_data)
{
	struct amd_ir_data *data = irq_data->chip_data;
	struct irq_2_irte *irte_info = &data->irq_2_irte;
	union irte entry;

	entry.val = 0;
	modify_irte(irte_info->devid, irte_info->index, entry);
}

static struct irq_domain_ops amd_ir_domain_ops = {
	.alloc = irq_remapping_alloc,
	.free = irq_remapping_free,
	.activate = irq_remapping_activate,
	.deactivate = irq_remapping_deactivate,
};

static int amd_ir_set_affinity(struct irq_data *data,
			       const struct cpumask *mask, bool force)
{
	struct amd_ir_data *ir_data = data->chip_data;
	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
	struct irq_cfg *cfg = irqd_cfg(data);
	struct irq_data *parent = data->parent_data;
	int ret;

	ret = parent->chip->irq_set_affinity(parent, mask, force);
	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
		return ret;

	/*
	 * Atomically updates the IRTE with the new destination, vector
	 * and flushes the interrupt entry cache.
	 */
	ir_data->irte_entry.fields.vector = cfg->vector;
	ir_data->irte_entry.fields.destination = cfg->dest_apicid;
	modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);

	/*
	 * After this point, all the interrupts will start arriving
	 * at the new destination. So, time to cleanup the previous
	 * vector allocation.
	 */
	send_cleanup_vector(cfg);

	return IRQ_SET_MASK_OK_DONE;
}

static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
{
	struct amd_ir_data *ir_data = irq_data->chip_data;

	*msg = ir_data->msi_entry;
}

static struct irq_chip amd_ir_chip = {
	.irq_ack = ir_ack_apic_edge,
	.irq_set_affinity = amd_ir_set_affinity,
	.irq_compose_msi_msg = ir_compose_msi_msg,
};

int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
{
	iommu->ir_domain = irq_domain_add_tree(NULL, &amd_ir_domain_ops, iommu);
	if (!iommu->ir_domain)
		return -ENOMEM;

	iommu->ir_domain->parent = arch_get_ir_parent_domain();
	iommu->msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);

	return 0;
}

#endif