提交 515d01f7 编写于 作者: L Linus Torvalds

Merge tag 'vfio-v3.9-rc1' of git://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:
 - Fixes PCIe v1 extended capability support

 - Cleans up read/write access functions

 - Fix Removal test to properly wait until devices are unused

 - Enable pcieport driver usage for non-accessible devices w/in groups

 - Extensions for PCI VGA support

* tag 'vfio-v3.9-rc1' of git://github.com/awilliam/linux-vfio:
  drivers/vfio: remove depends on CONFIG_EXPERIMENTAL
  vfio-pci: Add support for VGA region access
  vfio-pci: Manage user power state transitions
  vfio: whitelist pcieport
  vfio: Protect vfio_dev_present against device_del
  vfio-pci: Cleanup BAR access
  vfio-pci: Cleanup read/write functions
  vfio-pci: Enable PCIe extended capabilities on v1
......@@ -6,3 +6,13 @@ config VFIO_PCI
use of PCI drivers using the VFIO framework.
If you don't know what to do here, say N.
config VFIO_PCI_VGA
bool "VFIO PCI support for VGA devices"
depends on VFIO_PCI && X86 && VGA_ARB
help
Support for VGA extension to VFIO PCI. This exposes an additional
region on VGA devices for accessing legacy VGA addresses used by
BIOS and generic video drivers.
If you don't know what to do here, say N.
......@@ -84,6 +84,11 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
} else
vdev->msix_bar = 0xFF;
#ifdef CONFIG_VFIO_PCI_VGA
if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
vdev->has_vga = true;
#endif
return 0;
}
......@@ -285,6 +290,16 @@ static long vfio_pci_ioctl(void *device_data,
info.flags = VFIO_REGION_INFO_FLAG_READ;
break;
}
case VFIO_PCI_VGA_REGION_INDEX:
if (!vdev->has_vga)
return -EINVAL;
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.size = 0xc0000;
info.flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE;
break;
default:
return -EINVAL;
}
......@@ -366,52 +381,50 @@ static long vfio_pci_ioctl(void *device_data,
return -ENOTTY;
}
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
size_t count, loff_t *ppos)
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
struct vfio_pci_device *vdev = device_data;
struct pci_dev *pdev = vdev->pdev;
if (index >= VFIO_PCI_NUM_REGIONS)
return -EINVAL;
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
else if (index == VFIO_PCI_ROM_REGION_INDEX)
return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
case VFIO_PCI_ROM_REGION_INDEX:
if (iswrite)
return -EINVAL;
return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
case VFIO_PCI_VGA_REGION_INDEX:
return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
}
return -EINVAL;
}
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
size_t count, loff_t *ppos)
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
size_t count, loff_t *ppos)
{
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
struct vfio_pci_device *vdev = device_data;
struct pci_dev *pdev = vdev->pdev;
if (!count)
return 0;
if (index >= VFIO_PCI_NUM_REGIONS)
return -EINVAL;
return vfio_pci_rw(device_data, buf, count, ppos, false);
}
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return vfio_pci_config_readwrite(vdev, (char __user *)buf,
count, ppos, true);
else if (index == VFIO_PCI_ROM_REGION_INDEX)
return -EINVAL;
else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
return vfio_pci_io_readwrite(vdev, (char __user *)buf,
count, ppos, true);
else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
count, ppos, true);
}
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
size_t count, loff_t *ppos)
{
if (!count)
return 0;
return -EINVAL;
return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
......
......@@ -587,12 +587,46 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
return 0;
}
static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos,
int count, struct perm_bits *perm,
int offset, __le32 val)
{
count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
if (count < 0)
return count;
if (offset == PCI_PM_CTRL) {
pci_power_t state;
switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) {
case 0:
state = PCI_D0;
break;
case 1:
state = PCI_D1;
break;
case 2:
state = PCI_D2;
break;
case 3:
state = PCI_D3hot;
break;
}
pci_set_power_state(vdev->pdev, state);
}
return count;
}
/* Permissions for the Power Management capability */
static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
{
if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
return -ENOMEM;
perm->writefn = vfio_pm_config_write;
/*
* We always virtualize the next field so we can remove
* capabilities from the chain if we want to.
......@@ -600,10 +634,11 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
/*
* Power management is defined *per function*,
* so we let the user write this
* Power management is defined *per function*, so we can let
* the user change power state, but we trap and initiate the
* change ourselves, so the state bits are read-only.
*/
p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE);
p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK);
return 0;
}
......@@ -985,12 +1020,12 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
if (ret)
return pcibios_err_to_errno(ret);
vdev->extended_caps = true;
if ((word & PCI_EXP_FLAGS_VERS) == 1)
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
else {
vdev->extended_caps = true;
else
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
}
case PCI_CAP_ID_HT:
ret = pci_read_config_byte(pdev, pos + 3, &byte);
if (ret)
......@@ -1501,9 +1536,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
return ret;
}
ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite)
ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
size_t done = 0;
int ret = 0;
......
......@@ -53,6 +53,7 @@ struct vfio_pci_device {
bool reset_works;
bool extended_caps;
bool bardirty;
bool has_vga;
struct pci_saved_state *pci_saved_state;
atomic_t refcnt;
};
......@@ -70,15 +71,15 @@ extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
uint32_t flags, unsigned index,
unsigned start, unsigned count, void *data);
extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
extern int vfio_pci_init_perm_bits(void);
extern void vfio_pci_uninit_perm_bits(void);
......
......@@ -17,253 +17,222 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/vgaarb.h>
#include "vfio_pci_private.h"
/* I/O Port BAR access */
ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
* range which is inaccessible. The excluded range drops writes and fills
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
static ssize_t do_io_rw(void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite)
{
struct pci_dev *pdev = vdev->pdev;
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
void __iomem *io;
size_t done = 0;
if (!pci_resource_start(pdev, bar))
return -EINVAL;
if (pos + count > pci_resource_len(pdev, bar))
return -EINVAL;
if (!vdev->barmap[bar]) {
int ret;
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
if (ret)
return ret;
vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
if (!vdev->barmap[bar]) {
pci_release_selected_regions(pdev, 1 << bar);
return -EINVAL;
}
}
io = vdev->barmap[bar];
ssize_t done = 0;
while (count) {
int filled;
size_t fillable, filled;
if (off < x_start)
fillable = min(count, (size_t)(x_start - off));
else if (off >= x_end)
fillable = count;
else
fillable = 0;
if (count >= 3 && !(pos % 4)) {
if (fillable >= 4 && !(off % 4)) {
__le32 val;
if (iswrite) {
if (copy_from_user(&val, buf, 4))
return -EFAULT;
iowrite32(le32_to_cpu(val), io + pos);
iowrite32(le32_to_cpu(val), io + off);
} else {
val = cpu_to_le32(ioread32(io + pos));
val = cpu_to_le32(ioread32(io + off));
if (copy_to_user(buf, &val, 4))
return -EFAULT;
}
filled = 4;
} else if ((pos % 2) == 0 && count >= 2) {
} else if (fillable >= 2 && !(off % 2)) {
__le16 val;
if (iswrite) {
if (copy_from_user(&val, buf, 2))
return -EFAULT;
iowrite16(le16_to_cpu(val), io + pos);
iowrite16(le16_to_cpu(val), io + off);
} else {
val = cpu_to_le16(ioread16(io + pos));
val = cpu_to_le16(ioread16(io + off));
if (copy_to_user(buf, &val, 2))
return -EFAULT;
}
filled = 2;
} else {
} else if (fillable) {
u8 val;
if (iswrite) {
if (copy_from_user(&val, buf, 1))
return -EFAULT;
iowrite8(val, io + pos);
iowrite8(val, io + off);
} else {
val = ioread8(io + pos);
val = ioread8(io + off);
if (copy_to_user(buf, &val, 1))
return -EFAULT;
}
filled = 1;
} else {
/* Fill reads with -1, drop writes */
filled = min(count, (size_t)(x_end - off));
if (!iswrite) {
u8 val = 0xFF;
size_t i;
for (i = 0; i < filled; i++)
if (copy_to_user(buf + i, &val, 1))
return -EFAULT;
}
}
count -= filled;
done += filled;
off += filled;
buf += filled;
pos += filled;
}
*ppos += done;
return done;
}
/*
* MMIO BAR access
* We handle two excluded ranges here as well, if the user tries to read
* the ROM beyond what PCI tells us is available or the MSI-X table region,
* we return 0xFF and writes are dropped.
*/
ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
struct pci_dev *pdev = vdev->pdev;
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
void __iomem *io;
size_t x_start = 0, x_end = 0;
resource_size_t end;
size_t done = 0;
size_t x_start = 0, x_end = 0; /* excluded range */
void __iomem *io;
ssize_t done;
if (!pci_resource_start(pdev, bar))
return -EINVAL;
end = pci_resource_len(pdev, bar);
if (pos > end)
if (pos >= end)
return -EINVAL;
if (pos == end)
return 0;
if (pos + count > end)
count = end - pos;
count = min(count, (size_t)(end - pos));
if (bar == PCI_ROM_RESOURCE) {
/*
* The ROM can fill less space than the BAR, so we start the
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
io = pci_map_rom(pdev, &x_start);
if (!io)
return -ENOMEM;
x_end = end;
} else {
if (!vdev->barmap[bar]) {
int ret;
ret = pci_request_selected_regions(pdev, 1 << bar,
"vfio");
if (ret)
return ret;
} else if (!vdev->barmap[bar]) {
int ret;
vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
if (ret)
return ret;
if (!vdev->barmap[bar]) {
pci_release_selected_regions(pdev, 1 << bar);
return -EINVAL;
}
io = pci_iomap(pdev, bar, 0);
if (!io) {
pci_release_selected_regions(pdev, 1 << bar);
return -ENOMEM;
}
vdev->barmap[bar] = io;
} else
io = vdev->barmap[bar];
if (bar == vdev->msix_bar) {
x_start = vdev->msix_offset;
x_end = vdev->msix_offset + vdev->msix_size;
}
if (bar == vdev->msix_bar) {
x_start = vdev->msix_offset;
x_end = vdev->msix_offset + vdev->msix_size;
}
if (!io)
return -EINVAL;
while (count) {
size_t fillable, filled;
if (pos < x_start)
fillable = x_start - pos;
else if (pos >= x_end)
fillable = end - pos;
else
fillable = 0;
if (fillable >= 4 && !(pos % 4) && (count >= 4)) {
__le32 val;
if (iswrite) {
if (copy_from_user(&val, buf, 4))
goto out;
iowrite32(le32_to_cpu(val), io + pos);
} else {
val = cpu_to_le32(ioread32(io + pos));
done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
if (copy_to_user(buf, &val, 4))
goto out;
}
if (done >= 0)
*ppos += done;
filled = 4;
} else if (fillable >= 2 && !(pos % 2) && (count >= 2)) {
__le16 val;
if (iswrite) {
if (copy_from_user(&val, buf, 2))
goto out;
iowrite16(le16_to_cpu(val), io + pos);
} else {
val = cpu_to_le16(ioread16(io + pos));
if (copy_to_user(buf, &val, 2))
goto out;
}
filled = 2;
} else if (fillable) {
u8 val;
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
if (iswrite) {
if (copy_from_user(&val, buf, 1))
goto out;
return done;
}
iowrite8(val, io + pos);
} else {
val = ioread8(io + pos);
ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
int ret;
loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
void __iomem *iomem = NULL;
unsigned int rsrc;
bool is_ioport;
ssize_t done;
if (!vdev->has_vga)
return -EINVAL;
if (copy_to_user(buf, &val, 1))
goto out;
}
switch (pos) {
case 0xa0000 ... 0xbffff:
count = min(count, (size_t)(0xc0000 - pos));
iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
off = pos - 0xa0000;
rsrc = VGA_RSRC_LEGACY_MEM;
is_ioport = false;
break;
case 0x3b0 ... 0x3bb:
count = min(count, (size_t)(0x3bc - pos));
iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
off = pos - 0x3b0;
rsrc = VGA_RSRC_LEGACY_IO;
is_ioport = true;
break;
case 0x3c0 ... 0x3df:
count = min(count, (size_t)(0x3e0 - pos));
iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
off = pos - 0x3c0;
rsrc = VGA_RSRC_LEGACY_IO;
is_ioport = true;
break;
default:
return -EINVAL;
}
filled = 1;
} else {
/* Drop writes, fill reads with FF */
filled = min((size_t)(x_end - pos), count);
if (!iswrite) {
char val = 0xFF;
size_t i;
if (!iomem)
return -ENOMEM;
for (i = 0; i < filled; i++) {
if (put_user(val, buf + i))
goto out;
}
}
ret = vga_get_interruptible(vdev->pdev, rsrc);
if (ret) {
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
return ret;
}
}
done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
count -= filled;
done += filled;
buf += filled;
pos += filled;
}
vga_put(vdev->pdev, rsrc);
*ppos += done;
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
out:
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
if (done >= 0)
*ppos += done;
return count ? -EFAULT : done;
return done;
}
......@@ -442,7 +442,7 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
* a device. It's not always practical to leave a device within a group
* driverless as it could get re-bound to something unsafe.
*/
static const char * const vfio_driver_whitelist[] = { "pci-stub" };
static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" };
static bool vfio_whitelisted_driver(struct device_driver *drv)
{
......@@ -642,33 +642,16 @@ int vfio_add_group_dev(struct device *dev,
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/* Test whether a struct device is present in our tracking */
static bool vfio_dev_present(struct device *dev)
/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
struct iommu_group *iommu_group;
struct vfio_group *group;
struct vfio_device *device;
iommu_group = iommu_group_get(dev);
if (!iommu_group)
return false;
group = vfio_group_get_from_iommu(iommu_group);
if (!group) {
iommu_group_put(iommu_group);
return false;
}
device = vfio_group_get_device(group, dev);
if (!device) {
vfio_group_put(group);
iommu_group_put(iommu_group);
if (!device)
return false;
}
vfio_device_put(device);
vfio_group_put(group);
iommu_group_put(iommu_group);
return true;
}
......@@ -682,10 +665,18 @@ void *vfio_del_group_dev(struct device *dev)
struct iommu_group *iommu_group = group->iommu_group;
void *device_data = device->device_data;
/*
* The group exists so long as we have a device reference. Get
* a group reference and use it to scan for the device going away.
*/
vfio_group_get(group);
vfio_device_put(device);
/* TODO send a signal to encourage this to be released */
wait_event(vfio.release_q, !vfio_dev_present(dev));
wait_event(vfio.release_q, !vfio_dev_present(group, dev));
vfio_group_put(group);
iommu_group_put(iommu_group);
......
......@@ -303,6 +303,15 @@ enum {
VFIO_PCI_BAR5_REGION_INDEX,
VFIO_PCI_ROM_REGION_INDEX,
VFIO_PCI_CONFIG_REGION_INDEX,
/*
* Expose VGA regions defined for PCI base class 03, subclass 00.
* This includes I/O port ranges 0x3b0 to 0x3bb and 0x3c0 to 0x3df
* as well as the MMIO range 0xa0000 to 0xbffff. Each implemented
* range is found at it's identity mapped offset from the region
* offset, for example 0x3b0 is region_info.offset + 0x3b0. Areas
* between described ranges are unimplemented.
*/
VFIO_PCI_VGA_REGION_INDEX,
VFIO_PCI_NUM_REGIONS
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册