提交 f0f52a94 编写于 作者: L Linus Torvalds

Merge git://git.infradead.org/iommu-2.6

* git://git.infradead.org/iommu-2.6:
  intel-iommu: Fix off-by-one in RMRR setup
  intel-iommu: Add domain check in domain_remove_one_dev_info
  intel-iommu: Remove Host Bridge devices from identity mapping
  intel-iommu: Use coherent DMA mask when requested
  intel-iommu: Dont cache iova above 32bit
  intel-iommu: Speed up processing of the identity_mapping function
  intel-iommu: Check for identity mapping candidate using system dma mask
  intel-iommu: Only unlink device domains from iommu
  intel-iommu: Enable super page (2MiB, 1GiB, etc.) support
  intel-iommu: Flush unmaps at domain_exit
  intel-iommu: Remove obsolete comment from detect_intel_iommu
  intel-iommu: fix VT-d PMR disable for TXT on S3 resume
......@@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
With this option on every unmap_single operation will
result in a hardware IOTLB flush operation as opposed
to batching them for performance.
sp_off [Default Off]
By default, super page will be supported if Intel IOMMU
has the capability. With this option, super page will
not be supported.
intremap= [X86-64, Intel-IOMMU]
Format: { on (default) | off | nosid }
on enable Interrupt Remapping (default)
......
......@@ -698,12 +698,7 @@ int __init detect_intel_iommu(void)
{
#ifdef CONFIG_INTR_REMAP
struct acpi_table_dmar *dmar;
/*
* for now we will disable dma-remapping when interrupt
* remapping is enabled.
* When support for queued invalidation for IOTLB invalidation
* is added, we will not need this any more.
*/
dmar = (struct acpi_table_dmar *) dmar_tbl;
if (ret && cpu_has_x2apic && dmar->flags & 0x1)
printk(KERN_INFO
......
......@@ -47,6 +47,8 @@
#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
#define IS_BRIDGE_HOST_DEVICE(pdev) \
((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
......@@ -116,6 +118,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
return (pfn + level_size(level) - 1) & level_mask(level);
}
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
return 1 << ((lvl - 1) * LEVEL_STRIDE);
}
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
......@@ -142,6 +149,12 @@ static struct intel_iommu **g_iommus;
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;
/*
* set to 1 to panic kernel if can't successfully enable VT-d
* (used when kernel is launched w/ TXT)
*/
static int force_on = 0;
/*
* 0: Present
* 1-11: Reserved
......@@ -338,6 +351,9 @@ struct dmar_domain {
int iommu_coherency;/* indicate coherency of iommu access */
int iommu_snooping; /* indicate snooping control feature*/
int iommu_count; /* reference count of iommu */
int iommu_superpage;/* Level of superpages supported:
0 == 4KiB (no superpages), 1 == 2MiB,
2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
spinlock_t iommu_lock; /* protect iommu set in domain */
u64 max_addr; /* maximum mapped address */
};
......@@ -387,6 +403,7 @@ int dmar_disabled = 1;
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
......@@ -417,6 +434,10 @@ static int __init intel_iommu_setup(char *str)
printk(KERN_INFO
"Intel-IOMMU: disable batched IOTLB flush\n");
intel_iommu_strict = 1;
} else if (!strncmp(str, "sp_off", 6)) {
printk(KERN_INFO
"Intel-IOMMU: disable supported super page\n");
intel_iommu_superpage = 0;
}
str += strcspn(str, ",");
......@@ -555,11 +576,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
}
}
static void domain_update_iommu_superpage(struct dmar_domain *domain)
{
int i, mask = 0xf;
if (!intel_iommu_superpage) {
domain->iommu_superpage = 0;
return;
}
domain->iommu_superpage = 4; /* 1TiB */
for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
mask |= cap_super_page_val(g_iommus[i]->cap);
if (!mask) {
break;
}
}
domain->iommu_superpage = fls(mask);
}
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain_update_iommu_snooping(domain);
domain_update_iommu_superpage(domain);
}
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
......@@ -689,23 +731,31 @@ static void free_context_table(struct intel_iommu *iommu)
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
unsigned long pfn)
unsigned long pfn, int large_level)
{
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
struct dma_pte *parent, *pte = NULL;
int level = agaw_to_level(domain->agaw);
int offset;
int offset, target_level;
BUG_ON(!domain->pgd);
BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
parent = domain->pgd;
/* Search pte */
if (!large_level)
target_level = 1;
else
target_level = large_level;
while (level > 0) {
void *tmp_page;
offset = pfn_level_offset(pfn, level);
pte = &parent[offset];
if (level == 1)
if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
break;
if (level == target_level)
break;
if (!dma_pte_present(pte)) {
......@@ -733,10 +783,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
unsigned long pfn,
int level)
int level, int *large_page)
{
struct dma_pte *parent, *pte = NULL;
int total = agaw_to_level(domain->agaw);
......@@ -749,8 +800,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
if (level == total)
return pte;
if (!dma_pte_present(pte))
if (!dma_pte_present(pte)) {
*large_page = total;
break;
}
if (pte->val & DMA_PTE_LARGE_PAGE) {
*large_page = total;
return pte;
}
parent = phys_to_virt(dma_pte_addr(pte));
total--;
}
......@@ -763,6 +822,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
unsigned long last_pfn)
{
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
unsigned int large_page = 1;
struct dma_pte *first_pte, *pte;
BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
......@@ -771,14 +831,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
/* we don't need lock here; nobody else touches the iova range */
do {
first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
large_page = 1;
first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
if (!pte) {
start_pfn = align_to_level(start_pfn + 1, 2);
start_pfn = align_to_level(start_pfn + 1, large_page + 1);
continue;
}
do {
do {
dma_clear_pte(pte);
start_pfn++;
start_pfn += lvl_to_nr_pages(large_page);
pte++;
} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
......@@ -798,6 +859,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
int total = agaw_to_level(domain->agaw);
int level;
unsigned long tmp;
int large_page = 2;
BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
......@@ -813,7 +875,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
return;
do {
first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
large_page = level;
first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
if (large_page > level)
level = large_page + 1;
if (!pte) {
tmp = align_to_level(tmp + 1, level + 1);
continue;
......@@ -1397,6 +1462,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
else
domain->iommu_snooping = 0;
domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
domain->iommu_count = 1;
domain->nid = iommu->node;
......@@ -1417,6 +1483,10 @@ static void domain_exit(struct dmar_domain *domain)
if (!domain)
return;
/* Flush any lazy unmaps that may reference this domain */
if (!intel_iommu_strict)
flush_unmaps_timeout(0);
domain_remove_dev_info(domain);
/* destroy iovas */
put_iova_domain(&domain->iovad);
......@@ -1648,6 +1718,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
unsigned long iov_pfn,
unsigned long phy_pfn,
unsigned long pages)
{
int support, level = 1;
unsigned long pfnmerge;
support = domain->iommu_superpage;
/* To use a large page, the virtual *and* physical addresses
must be aligned to 2MiB/1GiB/etc. Lower bits set in either
of them will mean we have to use smaller pages. So just
merge them and check both at once. */
pfnmerge = iov_pfn | phy_pfn;
while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
pages >>= VTD_STRIDE_SHIFT;
if (!pages)
break;
pfnmerge >>= VTD_STRIDE_SHIFT;
level++;
support--;
}
return level;
}
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
struct scatterlist *sg, unsigned long phys_pfn,
unsigned long nr_pages, int prot)
......@@ -1656,6 +1754,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
phys_addr_t uninitialized_var(pteval);
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
unsigned long sg_res;
unsigned int largepage_lvl = 0;
unsigned long lvl_pages = 0;
BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
......@@ -1671,7 +1771,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
}
while (nr_pages--) {
while (nr_pages > 0) {
uint64_t tmp;
if (!sg_res) {
......@@ -1679,11 +1779,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
sg->dma_length = sg->length;
pteval = page_to_phys(sg_page(sg)) | prot;
phys_pfn = pteval >> VTD_PAGE_SHIFT;
}
if (!pte) {
first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
if (!pte)
return -ENOMEM;
/* It is large page*/
if (largepage_lvl > 1)
pteval |= DMA_PTE_LARGE_PAGE;
else
pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
}
/* We don't need lock here, nobody else
* touches the iova range
......@@ -1699,16 +1809,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
}
WARN_ON(1);
}
lvl_pages = lvl_to_nr_pages(largepage_lvl);
BUG_ON(nr_pages < lvl_pages);
BUG_ON(sg_res < lvl_pages);
nr_pages -= lvl_pages;
iov_pfn += lvl_pages;
phys_pfn += lvl_pages;
pteval += lvl_pages * VTD_PAGE_SIZE;
sg_res -= lvl_pages;
/* If the next PTE would be the first in a new page, then we
need to flush the cache on the entries we've just written.
And then we'll need to recalculate 'pte', so clear it and
let it get set again in the if (!pte) block above.
If we're done (!nr_pages) we need to flush the cache too.
Also if we've been setting superpages, we may need to
recalculate 'pte' and switch back to smaller pages for the
end of the mapping, if the trailing size is not enough to
use another superpage (i.e. sg_res < lvl_pages). */
pte++;
if (!nr_pages || first_pte_in_page(pte)) {
if (!nr_pages || first_pte_in_page(pte) ||
(largepage_lvl > 1 && sg_res < lvl_pages)) {
domain_flush_cache(domain, first_pte,
(void *)pte - (void *)first_pte);
pte = NULL;
}
iov_pfn++;
pteval += VTD_PAGE_SIZE;
sg_res--;
if (!sg_res)
if (!sg_res && nr_pages)
sg = sg_next(sg);
}
return 0;
......@@ -2016,7 +2148,7 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
return 0;
return iommu_prepare_identity_map(pdev, rmrr->base_address,
rmrr->end_address + 1);
rmrr->end_address);
}
#ifdef CONFIG_DMAR_FLOPPY_WA
......@@ -2030,7 +2162,7 @@ static inline void iommu_prepare_isa(void)
return;
printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
if (ret)
printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
......@@ -2106,10 +2238,10 @@ static int identity_mapping(struct pci_dev *pdev)
if (likely(!iommu_identity_mapping))
return 0;
info = pdev->dev.archdata.iommu;
if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
return (info->domain == si_domain);
list_for_each_entry(info, &si_domain->devices, link)
if (info->dev == pdev)
return 1;
return 0;
}
......@@ -2187,8 +2319,19 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
* Assume that they will -- if they turn out not to be, then we can
* take them out of the 1:1 domain later.
*/
if (!startup)
return pdev->dma_mask > DMA_BIT_MASK(32);
if (!startup) {
/*
* If the device's dma_mask is less than the system's memory
* size then this is not a candidate for identity mapping.
*/
u64 dma_mask = pdev->dma_mask;
if (pdev->dev.coherent_dma_mask &&
pdev->dev.coherent_dma_mask < dma_mask)
dma_mask = pdev->dev.coherent_dma_mask;
return dma_mask >= dma_get_required_mask(&pdev->dev);
}
return 1;
}
......@@ -2203,6 +2346,9 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
return -EFAULT;
for_each_pci_dev(pdev) {
/* Skip Host/PCI Bridge devices */
if (IS_BRIDGE_HOST_DEVICE(pdev))
continue;
if (iommu_should_identity_map(pdev, 1)) {
printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
hw ? "hardware" : "software", pci_name(pdev));
......@@ -2218,7 +2364,7 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
return 0;
}
static int __init init_dmars(int force_on)
static int __init init_dmars(void)
{
struct dmar_drhd_unit *drhd;
struct dmar_rmrr_unit *rmrr;
......@@ -2592,8 +2738,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
iommu = domain_get_iommu(domain);
size = aligned_nrpages(paddr, size);
iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
pdev->dma_mask);
iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
if (!iova)
goto error;
......@@ -3118,7 +3263,17 @@ static int init_iommu_hw(void)
if (iommu->qi)
dmar_reenable_qi(iommu);
for_each_active_iommu(iommu, drhd) {
for_each_iommu(iommu, drhd) {
if (drhd->ignored) {
/*
* we always have to disable PMRs or DMA may fail on
* this device
*/
if (force_on)
iommu_disable_protect_mem_regions(iommu);
continue;
}
iommu_flush_write_buffer(iommu);
iommu_set_root_entry(iommu);
......@@ -3127,7 +3282,8 @@ static int init_iommu_hw(void)
DMA_CCMD_GLOBAL_INVL);
iommu->flush.flush_iotlb(iommu, 0, 0, 0,
DMA_TLB_GLOBAL_FLUSH);
iommu_enable_translation(iommu);
if (iommu_enable_translation(iommu))
return 1;
iommu_disable_protect_mem_regions(iommu);
}
......@@ -3194,7 +3350,10 @@ static void iommu_resume(void)
unsigned long flag;
if (init_iommu_hw()) {
WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
if (force_on)
panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
else
WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
return;
}
......@@ -3271,7 +3430,6 @@ static struct notifier_block device_nb = {
int __init intel_iommu_init(void)
{
int ret = 0;
int force_on = 0;
/* VT-d is required for a TXT/tboot launch, so enforce that */
force_on = tboot_force_iommu();
......@@ -3309,7 +3467,7 @@ int __init intel_iommu_init(void)
init_no_remapping_devices();
ret = init_dmars(force_on);
ret = init_dmars();
if (ret) {
if (force_on)
panic("tboot: Failed to initialize DMARs\n");
......@@ -3380,8 +3538,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
spin_lock_irqsave(&device_domain_lock, flags);
list_for_each_safe(entry, tmp, &domain->devices) {
info = list_entry(entry, struct device_domain_info, link);
/* No need to compare PCI domain; it has to be the same */
if (info->bus == pdev->bus->number &&
if (info->segment == pci_domain_nr(pdev->bus) &&
info->bus == pdev->bus->number &&
info->devfn == pdev->devfn) {
list_del(&info->link);
list_del(&info->global);
......@@ -3419,10 +3577,13 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
domain_update_iommu_cap(domain);
spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
spin_lock_irqsave(&iommu->lock, tmp_flags);
clear_bit(domain->id, iommu->domain_ids);
iommu->domains[domain->id] = NULL;
spin_unlock_irqrestore(&iommu->lock, tmp_flags);
if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
!(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
spin_lock_irqsave(&iommu->lock, tmp_flags);
clear_bit(domain->id, iommu->domain_ids);
iommu->domains[domain->id] = NULL;
spin_unlock_irqrestore(&iommu->lock, tmp_flags);
}
}
spin_unlock_irqrestore(&device_domain_lock, flags);
......@@ -3505,6 +3666,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
domain->iommu_count = 0;
domain->iommu_coherency = 0;
domain->iommu_snooping = 0;
domain->iommu_superpage = 0;
domain->max_addr = 0;
domain->nid = -1;
......@@ -3720,7 +3882,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
struct dma_pte *pte;
u64 phys = 0;
pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
if (pte)
phys = dma_pte_addr(pte);
......
......@@ -63,8 +63,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
curr = iovad->cached32_node;
cached_iova = container_of(curr, struct iova, node);
if (free->pfn_lo >= cached_iova->pfn_lo)
iovad->cached32_node = rb_next(&free->node);
if (free->pfn_lo >= cached_iova->pfn_lo) {
struct rb_node *node = rb_next(&free->node);
struct iova *iova = container_of(node, struct iova, node);
/* only cache if it's below 32bit pfn */
if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
iovad->cached32_node = node;
else
iovad->cached32_node = NULL;
}
}
/* Computes the padding size required, to make the
......
......@@ -9,8 +9,12 @@
#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
#define VTD_STRIDE_SHIFT (9)
#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)
#define DMA_PTE_READ (1)
#define DMA_PTE_WRITE (2)
#define DMA_PTE_LARGE_PAGE (1 << 7)
#define DMA_PTE_SNP (1 << 11)
#define CONTEXT_TT_MULTI_LEVEL 0
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册