/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <joerg.roedel@amd.com>
 *         Leo Duran <leo.duran@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>

#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"

#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

#define LOOP_TIMEOUT	100000

static DEFINE_RWLOCK(amd_iommu_devtable_lock);

/* A list of preallocated protection domains */
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);

/* List of all available dev_data structures */
static LIST_HEAD(dev_data_list);
static DEFINE_SPINLOCK(dev_data_list_lock);

/*
 * Domain for untranslated devices - only allocated
 * if iommu=pt passed on kernel cmd line.
 */
static struct protection_domain *pt_domain;

static struct iommu_ops amd_iommu_ops;

static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;

/*
 * general struct to manage commands sent to an IOMMU
 */
struct iommu_cmd {
	u32 data[4];
};

static void update_domain(struct protection_domain *domain);
static int __init alloc_passthrough_domain(void);

/****************************************************************************
 *
 * Helper functions
 *
 ****************************************************************************/

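/* Allocate a new dev_data struct for 'devid' and add it to the global list */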
static struct iommu_dev_data *alloc_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
	if (!dev_data)
		return NULL;

	dev_data->devid = devid;
	atomic_set(&dev_data->bind, 0);

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static void free_dev_data(struct iommu_dev_data *dev_data)
{
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_del(&dev_data->dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	kfree(dev_data);
}

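/* Look up an already allocated dev_data struct for 'devid', if any */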
static struct iommu_dev_data *search_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
		if (dev_data->devid == devid)
			goto out_unlock;
	}

	dev_data = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static struct iommu_dev_data *find_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;

	dev_data = search_dev_data(devid);

	if (dev_data == NULL)
		dev_data = alloc_dev_data(devid);

	return dev_data;
}

static inline u16 get_device_id(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return calc_devid(pdev->bus->number, pdev->devfn);
}

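/* Per-device IOMMU data is kept in dev->archdata.iommu */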
static struct iommu_dev_data *get_dev_data(struct device *dev)
{
	return dev->archdata.iommu;
}

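/* A device is IOMMUv2 capable if it has the ATS, PRI and PASID capabilities */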
static bool pci_iommuv2_capable(struct pci_dev *pdev)
{
	static const int caps[] = {
		PCI_EXT_CAP_ID_ATS,
		PCI_PRI_CAP,
		PCI_PASID_CAP,
	};
	int i, pos;

	for (i = 0; i < 3; ++i) {
		pos = pci_find_ext_capability(pdev, caps[i]);
		if (pos == 0)
			return false;
	}

	return true;
}

/*
 * In this function the list of preallocated protection domains is traversed to
 * find the domain for a specific device
 */
static struct dma_ops_domain *find_protection_domain(u16 devid)
{
	struct dma_ops_domain *entry, *ret = NULL;
	unsigned long flags;
	u16 alias = amd_iommu_alias_table[devid];

	if (list_empty(&iommu_pd_list))
		return NULL;

	spin_lock_irqsave(&iommu_pd_list_lock, flags);

	list_for_each_entry(entry, &iommu_pd_list, list) {
		if (entry->target_dev == devid ||
		    entry->target_dev == alias) {
			ret = entry;
			break;
		}
	}

	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);

	return ret;
}

/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	u16 devid;

	if (!dev || !dev->dma_mask)
		return false;

	/* No device or no PCI device */
	if (dev->bus != &pci_bus_type)
		return false;

	devid = get_device_id(dev);

	/* Out of our scope? */
	if (devid > amd_iommu_last_bdf)
		return false;

	if (amd_iommu_rlookup_table[devid] == NULL)
		return false;

	return true;
}

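/* Set up the IOMMU-private data (dev_data, alias, IOMMUv2 flag) for a device */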
static int iommu_init_device(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;
	u16 alias;

	if (dev->archdata.iommu)
		return 0;

	dev_data = find_dev_data(get_device_id(dev));
	if (!dev_data)
		return -ENOMEM;

	alias = amd_iommu_alias_table[dev_data->devid];
	if (alias != dev_data->devid) {
		struct iommu_dev_data *alias_data;

		alias_data = find_dev_data(alias);
		if (alias_data == NULL) {
			pr_err("AMD-Vi: Warning: Unhandled device %s\n",
					dev_name(dev));
			free_dev_data(dev_data);
			return -ENOTSUPP;
		}
		dev_data->alias_data = alias_data;
	}

	if (pci_iommuv2_capable(pdev)) {
		struct amd_iommu *iommu;

		iommu              = amd_iommu_rlookup_table[dev_data->devid];
		dev_data->iommu_v2 = iommu->is_iommu_v2;
	}

	dev->archdata.iommu = dev_data;

	return 0;
}

static void iommu_ignore_device(struct device *dev)
{
	u16 devid, alias;

	devid = get_device_id(dev);
	alias = amd_iommu_alias_table[devid];

	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));

	amd_iommu_rlookup_table[devid] = NULL;
	amd_iommu_rlookup_table[alias] = NULL;
}

static void iommu_uninit_device(struct device *dev)
{
	/*
	 * Nothing to do here - we keep dev_data around for unplugged devices
	 * and reuse it when the device is re-plugged - not doing so would
	 * introduce a ton of races.
	 */
}

void __init amd_iommu_uninit_devices(void)
{
	struct iommu_dev_data *dev_data, *n;
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev) {

		if (!check_device(&pdev->dev))
			continue;

		iommu_uninit_device(&pdev->dev);
	}

	/* Free all of our dev_data structures */
	list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list)
		free_dev_data(dev_data);
}

int __init amd_iommu_init_devices(void)
{
	struct pci_dev *pdev = NULL;
	int ret = 0;

	for_each_pci_dev(pdev) {

		if (!check_device(&pdev->dev))
			continue;

		ret = iommu_init_device(&pdev->dev);
		if (ret == -ENOTSUPP)
			iommu_ignore_device(&pdev->dev);
		else if (ret)
			goto out_free;
	}

	return 0;

out_free:

	amd_iommu_uninit_devices();

	return ret;
}
#ifdef CONFIG_AMD_IOMMU_STATS

/*
 * Initialization code for statistics collection
 */

DECLARE_STATS_COUNTER(compl_wait);
DECLARE_STATS_COUNTER(cnt_map_single);
DECLARE_STATS_COUNTER(cnt_unmap_single);
DECLARE_STATS_COUNTER(cnt_map_sg);
DECLARE_STATS_COUNTER(cnt_unmap_sg);
DECLARE_STATS_COUNTER(cnt_alloc_coherent);
DECLARE_STATS_COUNTER(cnt_free_coherent);
DECLARE_STATS_COUNTER(cross_page);
DECLARE_STATS_COUNTER(domain_flush_single);
DECLARE_STATS_COUNTER(domain_flush_all);
DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);

static struct dentry *stats_dir;
static struct dentry *de_fflush;

static void amd_iommu_stats_add(struct __iommu_counter *cnt)
{
	if (stats_dir == NULL)
		return;

	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
				       &cnt->value);
}

static void amd_iommu_stats_init(void)
{
	stats_dir = debugfs_create_dir("amd-iommu", NULL);
	if (stats_dir == NULL)
		return;

	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir,
					 (u32 *)&amd_iommu_unmap_flush);

	amd_iommu_stats_add(&compl_wait);
	amd_iommu_stats_add(&cnt_map_single);
	amd_iommu_stats_add(&cnt_unmap_single);
	amd_iommu_stats_add(&cnt_map_sg);
	amd_iommu_stats_add(&cnt_unmap_sg);
	amd_iommu_stats_add(&cnt_alloc_coherent);
	amd_iommu_stats_add(&cnt_free_coherent);
	amd_iommu_stats_add(&cross_page);
	amd_iommu_stats_add(&domain_flush_single);
	amd_iommu_stats_add(&domain_flush_all);
	amd_iommu_stats_add(&alloced_io_mem);
	amd_iommu_stats_add(&total_map_requests);
}

#endif

/****************************************************************************
 *
 * Interrupt handling functions
 *
 ****************************************************************************/

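/* Dump the raw device table entry of 'devid' for debugging */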
static void dump_dte_entry(u16 devid)
{
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
			amd_iommu_dev_table[devid].data[i]);
}

static void dump_command(unsigned long phys_addr)
{
	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
}

static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
	u32 *event = __evt;
	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
	u64 address = (u64)(((u64)event[3]) << 32) | event[2];

	printk(KERN_ERR "AMD-Vi: Event logged [");

	switch (type) {
	case EVENT_TYPE_ILL_DEV:
		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		dump_dte_entry(devid);
		break;
	case EVENT_TYPE_IO_FAULT:
		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_DEV_TAB_ERR:
		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	case EVENT_TYPE_PAGE_TAB_ERR:
		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_ILL_CMD:
		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
		dump_command(address);
		break;
	case EVENT_TYPE_CMD_HARD_ERR:
		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
		       "flags=0x%04x]\n", address, flags);
		break;
	case EVENT_TYPE_IOTLB_INV_TO:
		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
		       "address=0x%016llx]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address);
		break;
	case EVENT_TYPE_INV_DEV_REQ:
		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	default:
		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
	}
}

static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);
		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
	}

	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);

	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head)
{
	struct amd_iommu_fault fault;
	volatile u64 *raw;
	int i;

	raw = (u64 *)(iommu->ppr_log + head);

	/*
	 * Hardware bug: Interrupt may arrive before the entry is written to
	 * memory. If this happens we need to wait for the entry to arrive.
	 */
	for (i = 0; i < LOOP_TIMEOUT; ++i) {
		if (PPR_REQ_TYPE(raw[0]) != 0)
			break;
		udelay(1);
	}

	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
		return;
	}

	fault.address   = raw[1];
	fault.pasid     = PPR_PASID(raw[0]);
	fault.device_id = PPR_DEVID(raw[0]);
	fault.tag       = PPR_TAG(raw[0]);
	fault.flags     = PPR_FLAGS(raw[0]);

	/*
	 * To detect the hardware bug we need to clear the entry
	 * back to zero.
	 */
	raw[0] = raw[1] = 0;

	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
}

static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
	unsigned long flags;
	u32 head, tail;

	if (iommu->ppr_log == NULL)
		return;

	spin_lock_irqsave(&iommu->lock, flags);

	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

	while (head != tail) {

		/* Handle PPR entry */
		iommu_handle_ppr_entry(iommu, head);

		/* Update and refresh ring-buffer state */
		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
	}

	/* enable ppr interrupts again */
	writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);

	spin_unlock_irqrestore(&iommu->lock, flags);
}

irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	struct amd_iommu *iommu;

	for_each_iommu(iommu) {
		iommu_poll_events(iommu);
		iommu_poll_ppr_log(iommu);
	}

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

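/* Busy-wait until the IOMMU sets the completion-wait semaphore, or time out */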
static int wait_on_sem(volatile u64 *sem)
{
	int i = 0;

	while (*sem == 0 && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {
		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
		return -EIO;
	}

	return 0;
}

static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd,
			       u32 tail)
{
	u8 *target;

	target = iommu->cmd_buf + tail;
	tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;

	/* Copy command to buffer */
	memcpy(target, cmd, sizeof(*cmd));

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}
static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
{
	WARN_ON(address & 0x7ULL);

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(__pa(address));
	cmd->data[2] = 1;
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  size_t size, u16 domid, int pde)
{
	u64 pages;
	int s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = 0;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = 1;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[1] |= domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
	if (s) /* size bit - we flush more than one 4kb page */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
}

static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, size_t size)
{
	u64 pages;
	int s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = 0;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = 1;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0]  = devid;
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
	if (s)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
}

static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

/*
 * Writes the command to the IOMMU's command buffer and informs the
 * hardware about the new command.
 */
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
{
	u32 left, tail, head, next_tail;
	unsigned long flags;

	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);

again:
	spin_lock_irqsave(&iommu->lock, flags);

	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
	left      = (head - next_tail) % iommu->cmd_buf_size;
	if (left <= 2) {
		struct iommu_cmd sync_cmd;
		volatile u64 sem = 0;
		int ret;

		build_completion_wait(&sync_cmd, (u64)&sem);
		copy_cmd_to_buffer(iommu, &sync_cmd, tail);
		spin_unlock_irqrestore(&iommu->lock, flags);

		if ((ret = wait_on_sem(&sem)) != 0)
			return ret;

		goto again;
	}

	copy_cmd_to_buffer(iommu, cmd, tail);

	/* We need to sync now to make sure all commands are processed */
	iommu->need_sync = sync;

	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}

/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU
 */
static int iommu_completion_wait(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	volatile u64 sem = 0;
	int ret;
	if (!iommu->need_sync)
		return 0;
	build_completion_wait(&cmd, (u64)&sem);
	ret = iommu_queue_command_sync(iommu, &cmd, false);
	if (ret)
		return ret;
	return wait_on_sem(&sem);
}

static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;
	build_inv_dte(&cmd, devid);
	return iommu_queue_command(iommu, &cmd);
}
static void iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;
	for (devid = 0; devid <= 0xffff; ++devid)
		iommu_flush_dte(iommu, devid);
	iommu_completion_wait(iommu);
}
/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
 */
static void iommu_flush_tlb_all(struct amd_iommu *iommu)
{
	u32 dom_id;

	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				      dom_id, 1);
		iommu_queue_command(iommu, &cmd);
	}
	iommu_completion_wait(iommu);
}

static void iommu_flush_all(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	build_inv_all(&cmd);
	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);
}

void iommu_flush_all_caches(struct amd_iommu *iommu)
{
	if (iommu_feature(iommu, FEATURE_IA)) {
		iommu_flush_all(iommu);
	} else {
		iommu_flush_dte_all(iommu);
		iommu_flush_tlb_all(iommu);
	}
}

/*
 * Command send function for flushing on-device TLB
 */
static int device_flush_iotlb(struct iommu_dev_data *dev_data,
			      u64 address, size_t size)
{
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;
	int qdep;
	qdep     = dev_data->ats.qdep;
	iommu    = amd_iommu_rlookup_table[dev_data->devid];
	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);

	return iommu_queue_command(iommu, &cmd);
}

/*
 * Command send function for invalidating a device table entry
 */
static int device_flush_dte(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu;
	int ret;
	iommu = amd_iommu_rlookup_table[dev_data->devid];
	ret = iommu_flush_dte(iommu, dev_data->devid);
	if (ret)
		return ret;

	if (dev_data->ats.enabled)
		ret = device_flush_iotlb(dev_data, 0, ~0UL);

	return ret;
}

/*
 * TLB invalidation function which is called from the mapping functions.
 * It invalidates a single PTE if the range to flush is within a single
 * page. Otherwise it flushes the whole TLB of the IOMMU.
 */
static void __domain_flush_pages(struct protection_domain *domain,
				 u64 address, size_t size, int pde)
{
	struct iommu_dev_data *dev_data;
	struct iommu_cmd cmd;
	int ret = 0, i;
	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;

		/*
		 * Devices of this domain are behind this IOMMU
		 * We need a TLB flush
		 */
		ret |= iommu_queue_command(amd_iommus[i], &cmd);
	}

	list_for_each_entry(dev_data, &domain->dev_list, list) {

		if (!dev_data->ats.enabled)
			continue;

		ret |= device_flush_iotlb(dev_data, address, size);
	}

	WARN_ON(ret);
}

static void domain_flush_pages(struct protection_domain *domain,
			       u64 address, size_t size)
{
	__domain_flush_pages(domain, address, size, 0);
}
/* Flush the whole IO/TLB for a given protection domain */
static void domain_flush_tlb(struct protection_domain *domain)
{
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void domain_flush_tlb_pde(struct protection_domain *domain)
{
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}

static void domain_flush_complete(struct protection_domain *domain)
{
	int i;
	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;
		/*
		 * Devices of this domain are behind this IOMMU
		 * We need to wait for completion of all commands.
		 */
		iommu_completion_wait(amd_iommus[i]);
	}
}

/*
 * This function flushes the DTEs for all devices in the domain
 */
static void domain_flush_devices(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;
	list_for_each_entry(dev_data, &domain->dev_list, list)
		device_flush_dte(dev_data);
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   gfp_t gfp)
{
	u64 *pte;

	if (domain->mode == PAGE_MODE_6_LEVEL)
		/* address space already 64 bit large */
		return false;

	pte = (void *)get_zeroed_page(gfp);
	if (!pte)
		return false;

	*pte             = PM_LEVEL_PDE(domain->mode,
					virt_to_phys(domain->pt_root));
	domain->pt_root  = pte;
	domain->mode    += 1;
	domain->updated  = true;

	return true;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->mode))
		increase_address_space(domain, gfp);

	level   = domain->mode - 1;
	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		if (!IOMMU_PTE_PRESENT(*pte)) {
			page = (u64 *)get_zeroed_page(gfp);
			if (!page)
				return NULL;
			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
		}

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(*pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
{
	int level;
	u64 *pte;

	if (address > PM_LEVEL_SIZE(domain->mode))
		return NULL;

	level   =  domain->mode - 1;
	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == 0x07) {
			unsigned long pte_mask, __pte;

			/*
			 * If we have a series of large PTEs, make
			 * sure to return a pointer to the first one.
			 */
			pte_mask = PTE_PAGE_SIZE(*pte);
			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
			__pte    = ((unsigned long)pte) & pte_mask;

			return (u64 *)__pte;
		}

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * Generic mapping functions. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_map_page(struct protection_domain *dom,
			  unsigned long bus_addr,
			  unsigned long phys_addr,
			  int prot,
			  unsigned long page_size)
{
	u64 __pte, *pte;
	int i, count;
	if (!(prot & IOMMU_PROT_MASK))
		return -EINVAL;

	bus_addr  = PAGE_ALIGN(bus_addr);
	phys_addr = PAGE_ALIGN(phys_addr);
	count     = PAGE_SIZE_PTE_COUNT(page_size);
	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);

	for (i = 0; i < count; ++i)
		if (IOMMU_PTE_PRESENT(pte[i]))
			return -EBUSY;
	if (page_size > PAGE_SIZE) {
		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
	} else
		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;

	if (prot & IOMMU_PROT_IR)
		__pte |= IOMMU_PTE_IR;
	if (prot & IOMMU_PROT_IW)
		__pte |= IOMMU_PTE_IW;

	for (i = 0; i < count; ++i)
		pte[i] = __pte;
	update_domain(dom);

	return 0;
}

static unsigned long iommu_unmap_page(struct protection_domain *dom,
				      unsigned long bus_addr,
				      unsigned long page_size)
{
	unsigned long long unmap_size, unmapped;
	u64 *pte;

	BUG_ON(!is_power_of_2(page_size));

	unmapped = 0;
	while (unmapped < page_size) {

		pte = fetch_pte(dom, bus_addr);

		if (!pte) {
			/*
			 * No PTE for this address
			 * move forward in 4kb steps
			 */
			unmap_size = PAGE_SIZE;
		} else if (PM_PTE_LEVEL(*pte) == 0) {
			/* 4kb PTE found for this address */
			unmap_size = PAGE_SIZE;
			*pte       = 0ULL;
		} else {
			int count, i;

			/* Large PTE found which maps this address */
			unmap_size = PTE_PAGE_SIZE(*pte);
			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		}

		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	BUG_ON(!is_power_of_2(unmapped));
	return unmapped;
}

/*
 * This function checks if a specific unity mapping entry is needed for
 * this specific IOMMU.
 */
static int iommu_for_unity_map(struct amd_iommu *iommu,
			       struct unity_map_entry *entry)
{
	u16 bdf, i;

	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
		bdf = amd_iommu_alias_table[i];
		if (amd_iommu_rlookup_table[bdf] == iommu)
			return 1;
	}

	return 0;
}

/*
 * This function actually applies the mapping to the page table of the
 * dma_ops domain.
 */
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
			     struct unity_map_entry *e)
{
	u64 addr;
	int ret;

	for (addr = e->address_start; addr < e->address_end;
	     addr += PAGE_SIZE) {
		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
				     PAGE_SIZE);
		if (ret)
			return ret;
		/*
		 * if unity mapping is in aperture range mark the page
		 * as allocated in the aperture
		 */
		if (addr < dma_dom->aperture_size)
			__set_bit(addr >> PAGE_SHIFT,
				  dma_dom->aperture[0]->bitmap);
	}

	return 0;
}

/*
 * Init the unity mappings for a specific IOMMU in the system
 *
 * Basically iterates over all unity mapping entries and applies them to
 * the default DMA domain of that IOMMU if necessary.
 */
static int iommu_init_unity_mappings(struct amd_iommu *iommu)
{
	struct unity_map_entry *entry;
	int ret;

	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
		if (!iommu_for_unity_map(iommu, entry))
			continue;
		ret = dma_ops_unity_map(iommu->default_dom, entry);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Inits the unity mappings required for a specific device
 */
static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
					  u16 devid)
{
	struct unity_map_entry *e;
	int ret;

	list_for_each_entry(e, &amd_iommu_unity_map, list) {
		if (!(devid >= e->devid_start && devid <= e->devid_end))
			continue;
		ret = dma_ops_unity_map(dma_dom, e);
		if (ret)
			return ret;
	}

	return 0;
}

/****************************************************************************
 *
 * The next functions belong to the address allocator for the dma_ops
 * interface functions. They work like the allocators in the other IOMMU
 * drivers. It's basically a bitmap which marks the allocated pages in
 * the aperture. Maybe it could be enhanced in the future to a more
 * efficient allocator.
 *
 ****************************************************************************/
/*
 * The address allocator core functions.
 *
 * called with domain->lock held
 */
/*
 * Used to reserve address ranges in the aperture (e.g. for exclusion
 * ranges).
 */
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
				      unsigned long start_page,
				      unsigned int pages)
{
	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;

	if (start_page + pages > last_page)
		pages = last_page - start_page;

	for (i = start_page; i < start_page + pages; ++i) {
		int index = i / APERTURE_RANGE_PAGES;
		int page  = i % APERTURE_RANGE_PAGES;
		__set_bit(page, dom->aperture[index]->bitmap);
	}
}

/*
 * This function is used to add a new aperture range to an existing
 * aperture in case of dma_ops domain allocation or address allocation
 * failure.
 */
static int alloc_new_range(struct dma_ops_domain *dma_dom,
			   bool populate, gfp_t gfp)
{
	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
	struct amd_iommu *iommu;
	unsigned long i, old_size;
#ifdef CONFIG_IOMMU_STRESS
	populate = false;
#endif

	if (index >= APERTURE_MAX_RANGES)
		return -ENOMEM;

	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
	if (!dma_dom->aperture[index])
		return -ENOMEM;

	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
	if (!dma_dom->aperture[index]->bitmap)
		goto out_free;

	dma_dom->aperture[index]->offset = dma_dom->aperture_size;

	if (populate) {
		unsigned long address = dma_dom->aperture_size;
		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
		u64 *pte, *pte_page;

		for (i = 0; i < num_ptes; ++i) {
			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
					&pte_page, gfp);
			if (!pte)
				goto out_free;

			dma_dom->aperture[index]->pte_pages[i] = pte_page;

			address += APERTURE_RANGE_SIZE / 64;
		}
	}

	old_size                = dma_dom->aperture_size;
	dma_dom->aperture_size += APERTURE_RANGE_SIZE;

	/* Reserve address range used for MSI messages */
	if (old_size < MSI_ADDR_BASE_LO &&
	    dma_dom->aperture_size > MSI_ADDR_BASE_LO) {
		unsigned long spage;
		int pages;

		pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE);
		spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT;

		dma_ops_reserve_addresses(dma_dom, spage, pages);
	}

	/* Initialize the exclusion range if necessary */
	for_each_iommu(iommu) {
		if (iommu->exclusion_start &&
		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
		    && iommu->exclusion_start < dma_dom->aperture_size) {
			unsigned long startpage;
			int pages = iommu_num_pages(iommu->exclusion_start,
						    iommu->exclusion_length,
						    PAGE_SIZE);
			startpage = iommu->exclusion_start >> PAGE_SHIFT;
			dma_ops_reserve_addresses(dma_dom, startpage, pages);
		}
	}

	/*
	 * Check for areas already mapped as present in the new aperture
	 * range and mark those pages as reserved in the allocator. Such
	 * mappings may already exist as a result of requested unity
	 * mappings for devices.
	 */
	for (i = dma_dom->aperture[index]->offset;
	     i < dma_dom->aperture_size;
	     i += PAGE_SIZE) {
		u64 *pte = fetch_pte(&dma_dom->domain, i);
		if (!pte || !IOMMU_PTE_PRESENT(*pte))
			continue;

		dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1);
	}

	update_domain(&dma_dom->domain);

	return 0;

out_free:
	update_domain(&dma_dom->domain);

	free_page((unsigned long)dma_dom->aperture[index]->bitmap);

	kfree(dma_dom->aperture[index]);
	dma_dom->aperture[index] = NULL;

	return -ENOMEM;
}

static unsigned long dma_ops_area_alloc(struct device *dev,
					struct dma_ops_domain *dom,
					unsigned int pages,
					unsigned long align_mask,
					u64 dma_mask,
					unsigned long start)
{
	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
	int i = start >> APERTURE_RANGE_SHIFT;
	unsigned long boundary_size;
	unsigned long address = -1;
	unsigned long limit;

	next_bit >>= PAGE_SHIFT;

	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
			PAGE_SIZE) >> PAGE_SHIFT;

	for (;i < max_index; ++i) {
		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;

		if (dom->aperture[i]->offset >= dma_mask)
			break;

		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
					       dma_mask >> PAGE_SHIFT);

		address = iommu_area_alloc(dom->aperture[i]->bitmap,
					   limit, next_bit, pages, 0,
					    boundary_size, align_mask);
		if (address != -1) {
			address = dom->aperture[i]->offset +
				  (address << PAGE_SHIFT);
1441
			dom->next_address = address + (pages << PAGE_SHIFT);
1442 1443 1444 1445 1446 1447 1448 1449 1450
			break;
		}

		next_bit = 0;
	}

	return address;
}

static unsigned long dma_ops_alloc_addresses(struct device *dev,
					     struct dma_ops_domain *dom,
					     unsigned int pages,
					     unsigned long align_mask,
					     u64 dma_mask)
{
	unsigned long address;

#ifdef CONFIG_IOMMU_STRESS
	dom->next_address = 0;
	dom->need_flush = true;
#endif
	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
				     dma_mask, dom->next_address);
	if (address == -1) {
		dom->next_address = 0;
		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
					     dma_mask, 0);
		dom->need_flush = true;
	}
	if (unlikely(address == -1))
		address = DMA_ERROR_CODE;

	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);

	return address;
}

/*
 * The address free function.
 *
 * called with domain->lock held
 */
static void dma_ops_free_addresses(struct dma_ops_domain *dom,
				   unsigned long address,
				   unsigned int pages)
{
	unsigned i = address >> APERTURE_RANGE_SHIFT;
	struct aperture_range *range = dom->aperture[i];
	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);

#ifdef CONFIG_IOMMU_STRESS
	if (i < 4)
		return;
#endif
	if (address >= dom->next_address)
		dom->need_flush = true;

	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
	bitmap_clear(range->bitmap, address, pages);
}

/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
 *
 ****************************************************************************/

/*
 * This function adds a protection domain to the global protection domain list
 */
static void add_domain_to_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_add(&domain->list, &amd_iommu_pd_list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

/*
 * This function removes a protection domain from the global
 * protection domain list
 */
static void del_domain_from_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_del(&domain->list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

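/* Allocate an unused protection domain id from the global bitmap */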
static u16 domain_id_alloc(void)
{
	unsigned long flags;
	int id;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
	BUG_ON(id == 0);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__set_bit(id, amd_iommu_pd_alloc_bitmap);
	else
		id = 0;
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	return id;
}

static void domain_id_free(int id)
{
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}

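/* Free all pages of an IO page table (walks up to three levels) */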
static void free_pagetable(struct protection_domain *domain)
{
	int i, j;
	u64 *p1, *p2, *p3;

	p1 = domain->pt_root;

	if (!p1)
		return;

	for (i = 0; i < 512; ++i) {
		if (!IOMMU_PTE_PRESENT(p1[i]))
			continue;

		p2 = IOMMU_PTE_PAGE(p1[i]);
		for (j = 0; j < 512; ++j) {
			if (!IOMMU_PTE_PRESENT(p2[j]))
				continue;
			p3 = IOMMU_PTE_PAGE(p2[j]);
			free_page((unsigned long)p3);
		}

		free_page((unsigned long)p2);
	}

	free_page((unsigned long)p1);

	domain->pt_root = NULL;
}

static void free_gcr3_table(struct protection_domain *domain)
{
	free_page((unsigned long)domain->gcr3_tbl);
}

/*
 * Free a domain, only used if something went wrong in the
 * allocation path and we need to free an already allocated page table
 */
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
	int i;

	if (!dom)
		return;

	del_domain_from_list(&dom->domain);

	free_pagetable(&dom->domain);

	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
		if (!dom->aperture[i])
			continue;
		free_page((unsigned long)dom->aperture[i]->bitmap);
		kfree(dom->aperture[i]);
	}

	kfree(dom);
}

/*
 * Allocates a new protection domain usable for the dma_ops functions.
 * It also initializes the page table and the address allocator data
 * structures required for the dma_ops interface
 */
static struct dma_ops_domain *dma_ops_domain_alloc(void)
{
	struct dma_ops_domain *dma_dom;

	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
	if (!dma_dom)
		return NULL;

	spin_lock_init(&dma_dom->domain.lock);

	dma_dom->domain.id = domain_id_alloc();
	if (dma_dom->domain.id == 0)
		goto free_dma_dom;
	INIT_LIST_HEAD(&dma_dom->domain.dev_list);
	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
	dma_dom->domain.flags = PD_DMA_OPS_MASK;
	dma_dom->domain.priv = dma_dom;
	if (!dma_dom->domain.pt_root)
		goto free_dma_dom;

	dma_dom->need_flush = false;
	dma_dom->target_dev = 0xffff;
	add_domain_to_list(&dma_dom->domain);

	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
		goto free_dma_dom;

	/*
	 * mark the first page as allocated so we never return 0 as
	 * a valid dma-address. So we can use 0 as error value
	 */
	dma_dom->aperture[0]->bitmap[0] = 1;
	dma_dom->next_address = 0;


	return dma_dom;

free_dma_dom:
	dma_ops_domain_free(dma_dom);

	return NULL;
}

/*
 * little helper function to check whether a given protection domain is a
 * dma_ops domain
 */
static bool dma_ops_domain(struct protection_domain *domain)
{
	return domain->flags & PD_DMA_OPS_MASK;
}

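/* Write the device table entry for 'devid' so that it points to 'domain' */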
static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
{
	u64 pte_root = 0;
	u64 flags = 0;
	if (domain->mode != PAGE_MODE_NONE)
		pte_root = virt_to_phys(domain->pt_root);

	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
		    << DEV_ENTRY_MODE_SHIFT;
	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
	flags = amd_iommu_dev_table[devid].data[1];

	if (ats)
		flags |= DTE_FLAG_IOTLB;

	if (domain->flags & PD_IOMMUV2_MASK) {
		u64 gcr3 = __pa(domain->gcr3_tbl);
		u64 glx  = domain->glx;
		u64 tmp;

		pte_root |= DTE_FLAG_GV;
		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;

		/* First mask out possible old values for GCR3 table */
		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
		flags    &= ~tmp;

		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
		flags    &= ~tmp;

		/* Encode GCR3 table into DTE */
		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
		pte_root |= tmp;

		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
		flags    |= tmp;

		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
		flags    |= tmp;
	}

	flags &= ~(0xffffUL);
	flags |= domain->id;

	amd_iommu_dev_table[devid].data[1]  = flags;
	amd_iommu_dev_table[devid].data[0]  = pte_root;
1739 1740 1741 1742 1743 1744 1745 1746 1747
}

static void clear_dte_entry(u16 devid)
{
	/* remove entry from the device table seen by the hardware */
	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
	amd_iommu_dev_table[devid].data[1] = 0;

	amd_iommu_apply_erratum_63(devid);
1748 1749
}

1750 1751
static void do_attach(struct iommu_dev_data *dev_data,
		      struct protection_domain *domain)
1752 1753
{
	struct amd_iommu *iommu;
1754
	bool ats;
1755

1756 1757
	iommu = amd_iommu_rlookup_table[dev_data->devid];
	ats   = dev_data->ats.enabled;
1758 1759 1760 1761

	/* Update data structures */
	dev_data->domain = domain;
	list_add(&dev_data->list, &domain->dev_list);
1762
	set_dte_entry(dev_data->devid, domain, ats);
1763 1764 1765 1766 1767 1768

	/* Do reference counting */
	domain->dev_iommu[iommu->index] += 1;
	domain->dev_cnt                 += 1;

	/* Flush the DTE entry */
	device_flush_dte(dev_data);
}

static void do_detach(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu;

	iommu = amd_iommu_rlookup_table[dev_data->devid];

	/* decrease reference counters */
	dev_data->domain->dev_iommu[iommu->index] -= 1;
	dev_data->domain->dev_cnt                 -= 1;

	/* Update data structures */
	dev_data->domain = NULL;
	list_del(&dev_data->list);
	clear_dte_entry(dev_data->devid);
	/* Flush the DTE entry */
	device_flush_dte(dev_data);
}

/*
 * If a device is not yet associated with a domain, this function
 * associates it with a domain and makes it visible to the hardware
 */
static int __attach_device(struct iommu_dev_data *dev_data,
			   struct protection_domain *domain)
{
	int ret;
	/* lock domain */
	spin_lock(&domain->lock);

	if (dev_data->alias_data != NULL) {
		struct iommu_dev_data *alias_data = dev_data->alias_data;
		/* Some sanity checks */
		ret = -EBUSY;
		if (alias_data->domain != NULL &&
				alias_data->domain != domain)
			goto out_unlock;
		if (dev_data->domain != NULL &&
				dev_data->domain != domain)
			goto out_unlock;
		/* Do real assignment */
		if (alias_data->domain == NULL)
			do_attach(alias_data, domain);

		atomic_inc(&alias_data->bind);
	}
	if (dev_data->domain == NULL)
		do_attach(dev_data, domain);
	atomic_inc(&dev_data->bind);

	ret = 0;

out_unlock:

	/* ready */
	spin_unlock(&domain->lock);
	return ret;
}

static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
	pci_disable_ats(pdev);
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);
}

static int pdev_iommuv2_enable(struct pci_dev *pdev)
{
	int ret;

	/* Only allow access to user-accessible pages */
	ret = pci_enable_pasid(pdev, 0);
	if (ret)
		goto out_err;

	/* First reset the PRI state of the device */
	ret = pci_reset_pri(pdev);
	if (ret)
		goto out_err;

	/* FIXME: Hardcode number of outstanding requests for now */
	ret = pci_enable_pri(pdev, 32);
	if (ret)
		goto out_err;

	ret = pci_enable_ats(pdev, PAGE_SHIFT);
	if (ret)
		goto out_err;

	return 0;

out_err:
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);

	return ret;
}

/*
 * If a device is not yet associated with a domain, this function
 * associates it with a domain and makes it visible to the hardware
 */
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;
	unsigned long flags;
	int ret;
	dev_data = get_dev_data(dev);

	if (domain->flags & PD_IOMMUV2_MASK) {
		if (!dev_data->iommu_v2 || !dev_data->passthrough)
			return -EINVAL;

		if (pdev_iommuv2_enable(pdev) != 0)
			return -EINVAL;

		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
	} else if (amd_iommu_iotlb_sup &&
		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
	}
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	ret = __attach_device(dev_data, domain);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	/*
	 * We might boot into a crash-kernel here. The crashed kernel
	 * left the caches in the IOMMU dirty. So we have to flush
	 * here to evict all dirty stuff.
	 */
	domain_flush_tlb_pde(domain);

	return ret;
}

/*
 * Removes a device from a protection domain (unlocked)
 */
static void __detach_device(struct iommu_dev_data *dev_data)
{
	struct protection_domain *domain;
	unsigned long flags;
	BUG_ON(!dev_data->domain);
	domain = dev_data->domain;

	spin_lock_irqsave(&domain->lock, flags);
	if (dev_data->alias_data != NULL) {
		struct iommu_dev_data *alias_data = dev_data->alias_data;

		if (atomic_dec_and_test(&alias_data->bind))
			do_detach(alias_data);
	}

	if (atomic_dec_and_test(&dev_data->bind))
		do_detach(dev_data);
	spin_unlock_irqrestore(&domain->lock, flags);

	/*
	 * If we run in passthrough mode the device must be assigned to the
	 * passthrough domain if it is detached from any other domain.
	 * Make sure we can deassign from the pt_domain itself.
	 */
	if (dev_data->passthrough &&
	    (dev_data->domain == NULL && domain != pt_domain))
		__attach_device(dev_data, pt_domain);
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
static void detach_device(struct device *dev)
{
	struct protection_domain *domain;
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = get_dev_data(dev);
	domain   = dev_data->domain;
	/* lock device table */
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	__detach_device(dev_data);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	if (domain->flags & PD_IOMMUV2_MASK)
		pdev_iommuv2_disable(to_pci_dev(dev));
	else if (dev_data->ats.enabled)
		pci_disable_ats(to_pci_dev(dev));

	dev_data->ats.enabled = false;
}
/*
 * Find out the protection domain structure for a given PCI device. This
 * will give us the pointer to the page table root for example.
 */
static struct protection_domain *domain_for_device(struct device *dev)
{
	struct iommu_dev_data *dev_data;
	struct protection_domain *dom = NULL;
	unsigned long flags;

	dev_data   = get_dev_data(dev);
	if (dev_data->domain)
		return dev_data->domain;
	if (dev_data->alias_data != NULL) {
		struct iommu_dev_data *alias_data = dev_data->alias_data;

		read_lock_irqsave(&amd_iommu_devtable_lock, flags);
		if (alias_data->domain != NULL) {
			__attach_device(dev_data, alias_data->domain);
			dom = alias_data->domain;
		}
		read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
	}

	return dom;
}

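/*
 * Bus notifier: keeps the dma_ops state of a device in sync when it is
 * added to, unbound from or removed from the PCI bus.
 */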
static int device_change_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct dma_ops_domain *dma_domain;
	struct protection_domain *domain;
	struct iommu_dev_data *dev_data;
	struct device *dev = data;
	struct amd_iommu *iommu;
	unsigned long flags;
	u16 devid;

	if (!check_device(dev))
		return 0;

	devid    = get_device_id(dev);
	iommu    = amd_iommu_rlookup_table[devid];
	dev_data = get_dev_data(dev);

	switch (action) {
	case BUS_NOTIFY_UNBOUND_DRIVER:

		domain = domain_for_device(dev);

		if (!domain)
			goto out;
		if (dev_data->passthrough)
			break;
		detach_device(dev);
		break;
	case BUS_NOTIFY_ADD_DEVICE:

		iommu_init_device(dev);

		domain = domain_for_device(dev);

		/* allocate a protection domain if a device is added */
		dma_domain = find_protection_domain(devid);
		if (dma_domain)
			goto out;
		dma_domain = dma_ops_domain_alloc();
		if (!dma_domain)
			goto out;
		dma_domain->target_dev = devid;

		spin_lock_irqsave(&iommu_pd_list_lock, flags);
		list_add_tail(&dma_domain->list, &iommu_pd_list);
		spin_unlock_irqrestore(&iommu_pd_list_lock, flags);

		break;
	case BUS_NOTIFY_DEL_DEVICE:

		iommu_uninit_device(dev);

	default:
		goto out;
	}

	iommu_completion_wait(iommu);

out:
	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_change_notifier,
};

void amd_iommu_init_notifier(void)
{
	bus_register_notifier(&pci_bus_type, &device_nb);
}

/*****************************************************************************
 *
 * The next functions belong to the dma_ops mapping/unmapping code.
 *
 *****************************************************************************/

/*
 * In the dma_ops path we only have the struct device. This function
 * finds the corresponding IOMMU, the protection domain and the
 * requestor id for a given device.
 * If the device is not yet associated with a domain this is also done
 * in this function.
 */
static struct protection_domain *get_domain(struct device *dev)
{
	struct protection_domain *domain;
	struct dma_ops_domain *dma_dom;
	u16 devid = get_device_id(dev);

	if (!check_device(dev))
		return ERR_PTR(-EINVAL);

	domain = domain_for_device(dev);
	if (domain != NULL && !dma_ops_domain(domain))
		return ERR_PTR(-EBUSY);

	if (domain != NULL)
		return domain;

	/* Device not bound yet - bind it */
	dma_dom = find_protection_domain(devid);
	if (!dma_dom)
		dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
	attach_device(dev, &dma_dom->domain);
	DUMP_printk("Using protection domain %d for device %s\n",
		    dma_dom->domain.id, dev_name(dev));

	return &dma_dom->domain;
}

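/*
 * Re-write the device table entries of all devices in the domain so they
 * point to the domain's current page table.
 */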
static void update_device_table(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;

	list_for_each_entry(dev_data, &domain->dev_list, list)
		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
}

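/*
 * Commit pending page-table changes: update all device table entries and
 * flush the DTEs and TLBs of the domain.
 */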
static void update_domain(struct protection_domain *domain)
{
	if (!domain->updated)
		return;

	update_device_table(domain);

	domain_flush_devices(domain);
	domain_flush_tlb_pde(domain);

	domain->updated = false;
}

/*
 * This function fetches the PTE for a given address in the aperture
 */
static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
			    unsigned long address)
{
	struct aperture_range *aperture;
	u64 *pte, *pte_page;

	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
	if (!aperture)
		return NULL;

	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
	if (!pte) {
		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
				GFP_ATOMIC);
		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
	} else
		pte += PM_LEVEL_INDEX(0, address);

	update_domain(&dom->domain);

	return pte;
}

/*
 * This is the generic map function. It maps one 4kb page at paddr to
 * the given address in the DMA address space for the domain.
 */
static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
				     unsigned long address,
				     phys_addr_t paddr,
				     int direction)
{
	u64 *pte, __pte;

	WARN_ON(address > dom->aperture_size);

	paddr &= PAGE_MASK;

	pte  = dma_ops_get_pte(dom, address);
	if (!pte)
		return DMA_ERROR_CODE;

	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;

	if (direction == DMA_TO_DEVICE)
		__pte |= IOMMU_PTE_IR;
	else if (direction == DMA_FROM_DEVICE)
		__pte |= IOMMU_PTE_IW;
	else if (direction == DMA_BIDIRECTIONAL)
		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;

	WARN_ON(*pte);

	*pte = __pte;

	return (dma_addr_t)address;
}

/*
 * The generic unmapping function for one page in the DMA address space.
 */
static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
				 unsigned long address)
{
	struct aperture_range *aperture;
	u64 *pte;

	if (address >= dom->aperture_size)
		return;

	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
	if (!aperture)
		return;

	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
	if (!pte)
		return;

	pte += PM_LEVEL_INDEX(0, address);

	WARN_ON(!*pte);

	*pte = 0ULL;
}

/*
 * This function contains common code for mapping of a physically
 * contiguous memory region into DMA address space. It is used by all
 * mapping functions provided with this IOMMU driver.
 * Must be called with the domain lock held.
 */
static dma_addr_t __map_single(struct device *dev,
			       struct dma_ops_domain *dma_dom,
			       phys_addr_t paddr,
			       size_t size,
			       int dir,
			       bool align,
			       u64 dma_mask)
{
	dma_addr_t offset = paddr & ~PAGE_MASK;
	dma_addr_t address, start, ret;
	unsigned int pages;
	unsigned long align_mask = 0;
	int i;

	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
	paddr &= PAGE_MASK;

	INC_STATS_COUNTER(total_map_requests);

	if (pages > 1)
		INC_STATS_COUNTER(cross_page);

	if (align)
		align_mask = (1UL << get_order(size)) - 1;

retry:
	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
					  dma_mask);
	if (unlikely(address == DMA_ERROR_CODE)) {
		/*
		 * setting next_address here will let the address
		 * allocator only scan the new allocated range in the
		 * first run. This is a small optimization.
		 */
		dma_dom->next_address = dma_dom->aperture_size;

		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
			goto out;

		/*
		 * aperture was successfully enlarged by 128 MB, try
		 * allocation again
		 */
		goto retry;
	}

	start = address;
	for (i = 0; i < pages; ++i) {
		ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
		if (ret == DMA_ERROR_CODE)
			goto out_unmap;

		paddr += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	address += offset;

	ADD_STATS_COUNTER(alloced_io_mem, size);

	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
		domain_flush_tlb(&dma_dom->domain);
		dma_dom->need_flush = false;
	} else if (unlikely(amd_iommu_np_cache))
		domain_flush_pages(&dma_dom->domain, address, size);

out:
	return address;

out_unmap:

	for (--i; i >= 0; --i) {
		start -= PAGE_SIZE;
		dma_ops_domain_unmap(dma_dom, start);
	}

	dma_ops_free_addresses(dma_dom, address, pages);

	return DMA_ERROR_CODE;
}

/*
 * Does the reverse of the __map_single function. Must be called with
 * the domain lock held too
 */
static void __unmap_single(struct dma_ops_domain *dma_dom,
			   dma_addr_t dma_addr,
			   size_t size,
			   int dir)
{
	dma_addr_t flush_addr;
	dma_addr_t i, start;
	unsigned int pages;

	if ((dma_addr == DMA_ERROR_CODE) ||
	    (dma_addr + size > dma_dom->aperture_size))
		return;

	flush_addr = dma_addr;
	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
	dma_addr &= PAGE_MASK;
	start = dma_addr;

	for (i = 0; i < pages; ++i) {
		dma_ops_domain_unmap(dma_dom, start);
		start += PAGE_SIZE;
	}

	SUB_STATS_COUNTER(alloced_io_mem, size);

	dma_ops_free_addresses(dma_dom, dma_addr, pages);

	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
		domain_flush_pages(&dma_dom->domain, flush_addr, size);
		dma_dom->need_flush = false;
	}
}

/*
 * The exported map_single function for dma_ops.
 */
static dma_addr_t map_page(struct device *dev, struct page *page,
			   unsigned long offset, size_t size,
			   enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	dma_addr_t addr;
	u64 dma_mask;
	phys_addr_t paddr = page_to_phys(page) + offset;

	INC_STATS_COUNTER(cnt_map_single);

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL)
		return (dma_addr_t)paddr;
	else if (IS_ERR(domain))
		return DMA_ERROR_CODE;

	dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
			    dma_mask);
	if (addr == DMA_ERROR_CODE)
		goto out;

	domain_flush_complete(domain);

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return addr;
}

/*
 * The exported unmap_single function for dma_ops.
 */
static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
		       enum dma_data_direction dir, struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;

	INC_STATS_COUNTER(cnt_unmap_single);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	spin_lock_irqsave(&domain->lock, flags);

	__unmap_single(domain->priv, dma_addr, size, dir);

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}

/*
 * This is a special map_sg function which is used if we should map a
 * device which is not handled by an AMD IOMMU in the system.
 */
static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
			   int nelems, int dir)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sglist, s, nelems, i) {
		s->dma_address = (dma_addr_t)sg_phys(s);
		s->dma_length  = s->length;
	}

	return nelems;
}

/*
 * The exported map_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static int map_sg(struct device *dev, struct scatterlist *sglist,
		  int nelems, enum dma_data_direction dir,
		  struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	int i;
	struct scatterlist *s;
	phys_addr_t paddr;
	int mapped_elems = 0;
	u64 dma_mask;

	INC_STATS_COUNTER(cnt_map_sg);

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL)
		return map_sg_no_iommu(dev, sglist, nelems, dir);
	else if (IS_ERR(domain))
		return 0;

	dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	for_each_sg(sglist, s, nelems, i) {
		paddr = sg_phys(s);

		s->dma_address = __map_single(dev, domain->priv,
					      paddr, s->length, dir, false,
					      dma_mask);

		if (s->dma_address) {
			s->dma_length = s->length;
			mapped_elems++;
		} else
			goto unmap;
	}

	domain_flush_complete(domain);

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return mapped_elems;
unmap:
	for_each_sg(sglist, s, mapped_elems, i) {
		if (s->dma_address)
			__unmap_single(domain->priv, s->dma_address,
				       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}

	mapped_elems = 0;

	goto out;
}

/*
 * The exported unmap_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static void unmap_sg(struct device *dev, struct scatterlist *sglist,
		     int nelems, enum dma_data_direction dir,
		     struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	struct scatterlist *s;
	int i;

	INC_STATS_COUNTER(cnt_unmap_sg);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	spin_lock_irqsave(&domain->lock, flags);

	for_each_sg(sglist, s, nelems, i) {
		__unmap_single(domain->priv, s->dma_address,
			       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}

/*
 * The exported alloc_coherent function for dma_ops.
 */
static void *alloc_coherent(struct device *dev, size_t size,
			    dma_addr_t *dma_addr, gfp_t flag)
{
	unsigned long flags;
	void *virt_addr;
	struct protection_domain *domain;
	phys_addr_t paddr;
	u64 dma_mask = dev->coherent_dma_mask;

	INC_STATS_COUNTER(cnt_alloc_coherent);

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL) {
		virt_addr = (void *)__get_free_pages(flag, get_order(size));
		*dma_addr = __pa(virt_addr);
		return virt_addr;
	} else if (IS_ERR(domain))
		return NULL;

	dma_mask  = dev->coherent_dma_mask;
	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
	flag     |= __GFP_ZERO;

	virt_addr = (void *)__get_free_pages(flag, get_order(size));
	if (!virt_addr)
		return NULL;

	paddr = virt_to_phys(virt_addr);

	if (!dma_mask)
		dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	*dma_addr = __map_single(dev, domain->priv, paddr,
				 size, DMA_BIDIRECTIONAL, true, dma_mask);

	if (*dma_addr == DMA_ERROR_CODE) {
		spin_unlock_irqrestore(&domain->lock, flags);
		goto out_free;
	}

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);

	return virt_addr;

out_free:

	free_pages((unsigned long)virt_addr, get_order(size));

	return NULL;
}

/*
 * The exported free_coherent function for dma_ops.
 */
static void free_coherent(struct device *dev, size_t size,
			  void *virt_addr, dma_addr_t dma_addr)
{
	unsigned long flags;
	struct protection_domain *domain;

	INC_STATS_COUNTER(cnt_free_coherent);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		goto free_mem;

	spin_lock_irqsave(&domain->lock, flags);

	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);

free_mem:
	free_pages((unsigned long)virt_addr, get_order(size));
}

/*
 * This function is called by the DMA layer to find out if we can handle a
 * particular device. It is part of the dma_ops.
 */
static int amd_iommu_dma_supported(struct device *dev, u64 mask)
{
	return check_device(dev);
}

/*
 * The function for pre-allocating protection domains.
 *
 * If the driver core informed the DMA layer when a driver grabs a device
 * we wouldn't need to preallocate the protection domains anymore.
 * For now we have to.
 */
static void prealloc_protection_domains(void)
{
	struct iommu_dev_data *dev_data;
	struct dma_ops_domain *dma_dom;
	struct pci_dev *dev = NULL;
	u16 devid;

	for_each_pci_dev(dev) {

		/* Do we handle this device? */
		if (!check_device(&dev->dev))
			continue;

		dev_data = get_dev_data(&dev->dev);
		if (!amd_iommu_force_isolation && dev_data->iommu_v2) {
			/* Make sure passthrough domain is allocated */
			alloc_passthrough_domain();
			dev_data->passthrough = true;
			attach_device(&dev->dev, pt_domain);
			pr_info("AMD-Vi: Using passthrough domain for device %s\n",
				dev_name(&dev->dev));
		}

		/* Is there already any domain for it? */
		if (domain_for_device(&dev->dev))
			continue;

		devid = get_device_id(&dev->dev);

		dma_dom = dma_ops_domain_alloc();
		if (!dma_dom)
			continue;
		init_unity_mappings_for_device(dma_dom, devid);
		dma_dom->target_dev = devid;

		attach_device(&dev->dev, &dma_dom->domain);

		list_add_tail(&dma_dom->list, &iommu_pd_list);
	}
}

static struct dma_map_ops amd_iommu_dma_ops = {
	.alloc_coherent = alloc_coherent,
	.free_coherent = free_coherent,
	.map_page = map_page,
	.unmap_page = unmap_page,
	.map_sg = map_sg,
	.unmap_sg = unmap_sg,
	.dma_supported = amd_iommu_dma_supported,
};

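/*
 * Assign the per-device dma_ops: translated devices get the IOMMU dma_ops,
 * passthrough devices get nommu_dma_ops. Returns the number of devices not
 * handled by any IOMMU.
 */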
static unsigned device_dma_ops_init(void)
{
	struct iommu_dev_data *dev_data;
	struct pci_dev *pdev = NULL;
	unsigned unhandled = 0;

	for_each_pci_dev(pdev) {
		if (!check_device(&pdev->dev)) {
			unhandled += 1;
			continue;
		}

		dev_data = get_dev_data(&pdev->dev);

		if (!dev_data->passthrough)
			pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
		else
			pdev->dev.archdata.dma_ops = &nommu_dma_ops;
	}

	return unhandled;
}

/*
 * The function which clues the AMD IOMMU driver into dma_ops.
 */

void __init amd_iommu_init_api(void)
{
	bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
}

int __init amd_iommu_init_dma_ops(void)
{
	struct amd_iommu *iommu;
	int ret, unhandled;

	/*
	 * first allocate a default protection domain for every IOMMU we
	 * found in the system. Devices not assigned to any other
	 * protection domain will be assigned to the default one.
	 */
	for_each_iommu(iommu) {
		iommu->default_dom = dma_ops_domain_alloc();
		if (iommu->default_dom == NULL)
			return -ENOMEM;
		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
		ret = iommu_init_unity_mappings(iommu);
		if (ret)
			goto free_domains;
	}

	/*
	 * Pre-allocate the protection domains for each device.
	 */
	prealloc_protection_domains();

	iommu_detected = 1;
	swiotlb = 0;

	/* Make the driver finally visible to the drivers */
	unhandled = device_dma_ops_init();
	if (unhandled && max_pfn > MAX_DMA32_PFN) {
		/* There are unhandled devices - initialize swiotlb for them */
		swiotlb = 1;
	}

	amd_iommu_stats_init();

	return 0;

free_domains:

	for_each_iommu(iommu) {
		if (iommu->default_dom)
			dma_ops_domain_free(iommu->default_dom);
	}

	return ret;
}

/*****************************************************************************
 *
 * The following functions belong to the exported interface of AMD IOMMU
 *
 * This interface allows access to lower level functions of the IOMMU
 * like protection domain handling and assignment of devices to domains
 * which is not possible with the dma_ops interface.
 *
 *****************************************************************************/

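/*
 * Detach all devices still bound to a protection domain before the domain
 * itself is freed.
 */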
static void cleanup_domain(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data, *next;
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);

	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
		__detach_device(dev_data);
		atomic_set(&dev_data->bind, 0);
	}

	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}

static void protection_domain_free(struct protection_domain *domain)
{
	if (!domain)
		return;

	del_domain_from_list(domain);

	if (domain->id)
		domain_id_free(domain->id);

	kfree(domain);
}

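/*
 * Allocate a bare protection domain and initialize its locks, id and
 * device list. The caller is responsible for setting up the page table.
 */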
static struct protection_domain *protection_domain_alloc(void)
{
	struct protection_domain *domain;

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return NULL;

	spin_lock_init(&domain->lock);
	mutex_init(&domain->api_lock);
	domain->id = domain_id_alloc();
	if (!domain->id)
		goto out_err;
	INIT_LIST_HEAD(&domain->dev_list);

	add_domain_to_list(domain);

	return domain;

out_err:
	kfree(domain);

	return NULL;
}

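/* Allocate the global passthrough domain (no page-table translation) */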
static int __init alloc_passthrough_domain(void)
{
	if (pt_domain != NULL)
		return 0;

	/* allocate passthrough domain */
	pt_domain = protection_domain_alloc();
	if (!pt_domain)
		return -ENOMEM;

	pt_domain->mode = PAGE_MODE_NONE;

	return 0;
}

static int amd_iommu_domain_init(struct iommu_domain *dom)
{
	struct protection_domain *domain;

	domain = protection_domain_alloc();
	if (!domain)
		goto out_free;

	domain->mode    = PAGE_MODE_3_LEVEL;
	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
	if (!domain->pt_root)
		goto out_free;

	dom->priv = domain;

	return 0;

out_free:
	protection_domain_free(domain);

	return -ENOMEM;
}

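/* iommu_ops callback: detach remaining devices and free the domain */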
static void amd_iommu_domain_destroy(struct iommu_domain *dom)
{
	struct protection_domain *domain = dom->priv;

	if (!domain)
		return;

	if (domain->dev_cnt > 0)
		cleanup_domain(domain);

	BUG_ON(domain->dev_cnt != 0);

	if (domain->mode != PAGE_MODE_NONE)
		free_pagetable(domain);

	if (domain->flags & PD_IOMMUV2_MASK)
		free_gcr3_table(domain);

	protection_domain_free(domain);

	dom->priv = NULL;
}

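/* iommu_ops callback: detach a device from its protection domain */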
static void amd_iommu_detach_device(struct iommu_domain *dom,
				    struct device *dev)
{
	struct iommu_dev_data *dev_data = dev->archdata.iommu;
	struct amd_iommu *iommu;
	u16 devid;

	if (!check_device(dev))
		return;

	devid = get_device_id(dev);

	if (dev_data->domain != NULL)
		detach_device(dev);

	iommu = amd_iommu_rlookup_table[devid];
	if (!iommu)
		return;

	iommu_completion_wait(iommu);
}

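/* iommu_ops callback: attach a device to the given protection domain */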
static int amd_iommu_attach_device(struct iommu_domain *dom,
				   struct device *dev)
{
	struct protection_domain *domain = dom->priv;
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	int ret;

	if (!check_device(dev))
		return -EINVAL;

	dev_data = dev->archdata.iommu;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	if (!iommu)
		return -EINVAL;

	if (dev_data->domain)
		detach_device(dev);

	ret = attach_device(dev, domain);

	iommu_completion_wait(iommu);

	return ret;
}

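/* iommu_ops callback: map a page of the given order into the domain */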
static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
			 phys_addr_t paddr, int gfp_order, int iommu_prot)
{
	unsigned long page_size = 0x1000UL << gfp_order;
	struct protection_domain *domain = dom->priv;
	int prot = 0;
	int ret;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		prot |= IOMMU_PROT_IR;
	if (iommu_prot & IOMMU_WRITE)
		prot |= IOMMU_PROT_IW;

	mutex_lock(&domain->api_lock);
	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
	mutex_unlock(&domain->api_lock);

	return ret;
}

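/* iommu_ops callback: unmap a page of the given order and flush the TLB */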
static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
			   int gfp_order)
{
	struct protection_domain *domain = dom->priv;
	unsigned long page_size, unmap_size;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	page_size  = 0x1000UL << gfp_order;

	mutex_lock(&domain->api_lock);
	unmap_size = iommu_unmap_page(domain, iova, page_size);
	mutex_unlock(&domain->api_lock);

	domain_flush_tlb_pde(domain);

	return get_order(unmap_size);
}

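/* iommu_ops callback: resolve an IO virtual address to a physical address */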
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
					  unsigned long iova)
{
	struct protection_domain *domain = dom->priv;
	unsigned long offset_mask;
	phys_addr_t paddr;
	u64 *pte, __pte;

	if (domain->mode == PAGE_MODE_NONE)
		return iova;

	pte = fetch_pte(domain, iova);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	if (PM_PTE_LEVEL(*pte) == 0)
		offset_mask = PAGE_SIZE - 1;
	else
		offset_mask = PTE_PAGE_SIZE(*pte) - 1;

	__pte = *pte & PM_ADDR_MASK;
	paddr = (__pte & ~offset_mask) | (iova & offset_mask);

	return paddr;
}

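/* iommu_ops callback: report whether the domain supports a given capability */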
static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
				    unsigned long cap)
{
	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
		return 1;
	}

	return 0;
}

static struct iommu_ops amd_iommu_ops = {
	.domain_init = amd_iommu_domain_init,
	.domain_destroy = amd_iommu_domain_destroy,
	.attach_dev = amd_iommu_attach_device,
	.detach_dev = amd_iommu_detach_device,
	.map = amd_iommu_map,
	.unmap = amd_iommu_unmap,
	.iova_to_phys = amd_iommu_iova_to_phys,
	.domain_has_cap = amd_iommu_domain_has_cap,
};

/*****************************************************************************
 *
 * The next functions do a basic initialization of IOMMU for pass through
 * mode
 *
 * In passthrough mode the IOMMU is initialized and enabled but not used for
 * DMA-API translation.
 *
 *****************************************************************************/

int __init amd_iommu_init_passthrough(void)
{
	struct iommu_dev_data *dev_data;
	struct pci_dev *dev = NULL;
	struct amd_iommu *iommu;
	u16 devid;
	int ret;

	ret = alloc_passthrough_domain();
	if (ret)
		return ret;

	for_each_pci_dev(dev) {
		if (!check_device(&dev->dev))
			continue;

		dev_data = get_dev_data(&dev->dev);
		dev_data->passthrough = true;

		devid = get_device_id(&dev->dev);

		iommu = amd_iommu_rlookup_table[devid];
		if (!iommu)
			continue;

		attach_device(&dev->dev, pt_domain);
	}

	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");

	return 0;
}

/* IOMMUv2 specific functions */
int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);

int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);

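/*
 * Remove the page table from a domain and switch it to PAGE_MODE_NONE so
 * the IOMMU no longer translates DMA for it.
 */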
void amd_iommu_domain_direct_map(struct iommu_domain *dom)
{
	struct protection_domain *domain = dom->priv;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);

	/* Update data structure */
	domain->mode    = PAGE_MODE_NONE;
	domain->updated = true;

	/* Make changes visible to IOMMUs */
	update_domain(domain);

	/* Page-table is not visible to IOMMU anymore, so free it */
	free_pagetable(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}
EXPORT_SYMBOL(amd_iommu_domain_direct_map);

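/*
 * Enable IOMMUv2 (GCR3-based) translation for a domain that has no devices
 * attached yet; allocates the GCR3 table for the requested number of PASIDs.
 */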
int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
{
	struct protection_domain *domain = dom->priv;
	unsigned long flags;
	int levels, ret;

	if (pasids <= 0 || pasids > (PASID_MASK + 1))
		return -EINVAL;

	/* Number of GCR3 table levels required */
	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
		levels += 1;

	if (levels > amd_iommu_max_glx_val)
		return -EINVAL;

	spin_lock_irqsave(&domain->lock, flags);

	/*
	 * Save us all the sanity checks of whether devices already in the
	 * domain support IOMMUv2. Just force that the domain has no
	 * devices attached when it is switched into IOMMUv2 mode.
	 */
	ret = -EBUSY;
	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
		goto out;

	ret = -ENOMEM;
	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
	if (domain->gcr3_tbl == NULL)
		goto out;

	domain->glx      = levels;
	domain->flags   |= PD_IOMMUV2_MASK;
	domain->updated  = true;

	update_domain(domain);

	ret = 0;

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_enable_v2);