// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>

#include "intel-pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, int pasid);

#define PRQ_ORDER 0
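/*
 * With PRQ_ORDER 0 the page request queue occupies a single 4KiB page.
 * Each page request descriptor is 32 bytes, so the queue can hold 128
 * outstanding page requests.
 */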

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

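	/*
	 * The hwirq index is offset by DMAR_UNITS_SUPPORTED so that the
	 * page request queue vector stays distinct from the DMAR unit's
	 * primary fault-event interrupt, which uses index iommu->seq_id.
	 */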
	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
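	/*
	 * Initialize the queue: reset the head and tail pointers, then
	 * program the queue address register with the physical base of
	 * the queue and its size order in the low bits.
	 */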
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

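	/*
	 * First-level translation shares the CPU's page table format, so
	 * SVM is only offered when the IOMMU supports the large page
	 * sizes and paging mode (4- vs 5-level) that the CPU may use.
	 */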
	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
				unsigned long address, unsigned long pages, int ih)
{
	struct qi_desc desc;
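	/*
	 * Build a PASID-based IOTLB invalidation: a @pages value of -1
	 * requests a flush of the whole address space for the PASID,
	 * otherwise a page-selective flush of a power-of-two sized range.
	 */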

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
				QI_EIOTLB_DID(sdev->did) |
				QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
				QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
				QI_EIOTLB_IH(ih) |
				QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(svm->iommu, &desc, 1, 0);

	if (sdev->dev_iotlb) {
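		/*
		 * ATS is enabled on this device, so stale translations
		 * cached in its device TLB must be invalidated as well.
		 */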
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
				QI_DEV_EIOTLB_SID(sdev->sid) |
				QI_DEV_EIOTLB_QDEP(sdev->qdep) |
				QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
					QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(svm->iommu, &desc, 1, 0);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();

}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

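/*
 * Iterate over svm->devs and execute the loop body only for the entry
 * whose device matches @d. All users in this file hold pasid_mutex
 * while walking the list.
 */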
#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
	struct dmar_domain *dmar_domain;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
	    data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to validate
	 * the guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	svm = ioasid_find(NULL, data->hpasid, NULL);
	if (IS_ERR(svm)) {
		ret = PTR_ERR(svm);
		goto out;
	}

	if (svm) {
		/*
		 * If we found an svm for the PASID, there must be at least
		 * one device bound to it; otherwise the svm should have
		 * been freed.
		 */
		if (WARN_ON(list_empty(&svm->devs))) {
			ret = -EINVAL;
			goto out;
		}

		/*
		 * Do not allow multiple bindings of the same device-PASID pair,
		 * since there is only one set of second-level (SL) page tables
		 * per PASID. We may revisit this once sharing a PGD across
		 * domains is supported.
		 */
		for_each_svm_dev(sdev, svm, dev) {
			dev_warn_ratelimited(dev,
					     "Already bound with PASID %u\n",
					     svm->pasid);
			ret = -EBUSY;
			goto out;
		}
	} else {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer (VFIO) can track the host process
		 * that binds the PASID. ioasid_set = mm might be sufficient
		 * for VFIO to check PASID VMM ownership. We can drop the
		 * following line once the VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock(&iommu->lock);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vtd, dmar_domain,
				       data->addr_width);
	spin_unlock(&iommu->lock);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested mode
		 * setup failed, so we only need to clear the IOASID tracking
		 * data for a later free call to succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
 out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, int pasid)
{
	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret = -EINVAL;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	svm = ioasid_find(NULL, pasid, NULL);
	if (!svm) {
		ret = -EINVAL;
		goto out;
	}

	if (IS_ERR(svm)) {
		ret = PTR_ERR(svm);
		goto out;
	}

	for_each_svm_dev(sdev, svm, dev) {
		ret = 0;
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, an IOASID for guest use is
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will be notified
				 * and will perform the cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
		break;
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* A supervisor PASID must be bound with mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_free(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock(&iommu->lock);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_free(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
	} else {
		/*
		 * Binding a new device with existing PASID, need to setup
		 * the PASID entry.
		 */
		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
						mm ? mm->pgd : init_mm.pgd,
						svm->pasid, FLPT_DEFAULT_DID,
						(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
						(cpu_feature_enabled(X86_FEATURE_LA57) ?
						PASID_FLAG_FL5LP : 0));
		spin_unlock(&iommu->lock);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
 out:
	return ret;
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, int pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = intel_svm_device_to_iommu(dev);
	if (!iommu)
		goto out;

	svm = ioasid_find(NULL, pasid, NULL);
	if (!svm)
		goto out;

	if (IS_ERR(svm)) {
		ret = PTR_ERR(svm);
		goto out;
	}

	for_each_svm_dev(sdev, svm, dev) {
		ret = 0;
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_free(svm->pasid);
				if (svm->mm)
					mmu_notifier_unregister(&svm->notifier, svm->mm);
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
		break;
	}
 out:

	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)
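/*
 * The head and tail registers hold byte offsets of 32-byte descriptors;
 * masking with (queue size - 0x20) wraps an incremented offset around
 * the ring.
 */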

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

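/*
 * Check that the faulting address is canonical: bits above the
 * implemented virtual address width must be a sign extension of the
 * top implemented bit. Non-canonical addresses get an invalid response.
 */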
static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, int pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
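	/*
	 * desc[0] is an invalidation wait with the fence bit set, desc[1]
	 * is a PASID-selective IOTLB invalidation, and desc[2] is a device
	 * IOTLB invalidation covering the whole PASID address space.
	 */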
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct intel_svm_dev *sdev;
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
	invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
	bad_req:
		/* Accounting for major/minor faults? */
		rcu_read_lock();
		list_for_each_entry_rcu(sdev, &svm->devs, list) {
			if (sdev->sid == req->rid)
				break;
		}
		/* Other devices can go away, but the drivers are not permitted
		 * to unbind while any page faults might be in flight. So it's
		 * OK to drop the 'lock' here now we have it. */
		rcu_read_unlock();

		if (WARN_ON(&sdev->list == &svm->devs))
			sdev = NULL;

		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
	no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);

			if (req->priv_data_present) {
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			} else {
				resp.qw2 = 0;
				resp.qw3 = 0;
			}
			qi_submit_sync(iommu, &resp, 1, 0);
		}
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
		writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

int intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	int pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}