Commit 8e9a8681 authored by Anthony Liguori

Merge remote-tracking branch 'mst/tags/for_anthony' into staging

pci,virtio

This further optimizes MSIX handling in virtio-pci.
Also included are a pci cleanup by Paolo and a pci device
assignment fix by Alex.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

* mst/tags/for_anthony:
  pci-assign: Enable MSIX on device to match guest
  pci: use constants for devices under the 1B36 device ID, document them
  ivshmem: use symbolic constant for PCI ID, add to pci-ids.txt
  virtio-9p: use symbolic constant, add to pci-ids.txt
  reorganize pci-ids.txt
  docs: move pci-ids.txt to docs/specs/
  vhost: backend masking support
  vhost: set started flag while start is in progress
  virtio-net: set/clear vhost_started in reverse order
  virtio: backend virtqueue notifier masking
  virtio-pci: cache msix messages
  kvm: add stub for update msi route
  msix: add api to access msix message
  virtio: don't waste irqfds on control vqs
PCI IDs for qemu
================
Red Hat, Inc. donates a part of its device ID range to qemu, to be used for
virtual devices. The vendor IDs are 1af4 (formerly Qumranet ID) and 1b36.
Contact Gerd Hoffmann <kraxel@redhat.com> to get a device ID assigned
for your devices.
1af4 vendor ID
--------------
The 1000 -> 10ff device ID range is used as follows for virtio-pci devices.
Note that this allocation is separate from the virtio device IDs, which are
maintained as part of the virtio specification.
1af4:1000 network device
1af4:1001 block device
1af4:1002 balloon device
1af4:1003 console device
1af4:1004 SCSI host bus adapter device
1af4:1005 entropy generator device
1af4:1009 9p filesystem device
1af4:10f0  Available for experimental usage without registration.  Must get
   to      official ID when the code leaves the test lab (i.e. when seeking
1af4:10ff  upstream merge or shipping a distro/product) to avoid conflicts.
1af4:1100 Used as PCI Subsystem ID for existing hardware devices emulated
by qemu.
1af4:1110 ivshmem device (shared memory, docs/specs/ivshmem_device_spec.txt)
All other device IDs are reserved.
1b36 vendor ID
--------------
The 0000 -> 00ff device ID range is used as follows for QEMU-specific
PCI devices (other than virtio):
1b36:0001 PCI-PCI bridge
1b36:0002 PCI serial port (16550A) adapter (docs/specs/pci-serial.txt)
1b36:0003 PCI Dual-port 16550A adapter (docs/specs/pci-serial.txt)
1b36:0004 PCI Quad-port 16550A adapter (docs/specs/pci-serial.txt)
All these devices are documented in docs/specs.
The 0100 device ID is used for the QXL video card device.
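As a hypothetical sketch (not part of this series): a device still under
development would claim an ID from the experimental window above in its
class_init, then trade it for an officially assigned ID before merging.
Here k is a PCIDeviceClass, as in the hunks below:

    /* Hypothetical experimental device: anything in 1af4:10f0..10ff may
     * be used without registration, but must not go upstream or ship
     * with this ID. */
    k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;  /* 0x1af4 */
    k->device_id = 0x10f0;                         /* experimental window */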
......@@ -170,7 +170,7 @@ static void virtio_9p_class_init(ObjectClass *klass, void *data)
k->init = virtio_9p_init_pci;
k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
k->device_id = 0x1009;
k->device_id = PCI_DEVICE_ID_VIRTIO_9P;
k->revision = VIRTIO_PCI_ABI_VERSION;
k->class_id = 0x2;
dc->props = virtio_9p_properties;
......
......@@ -29,6 +29,9 @@
#include <sys/mman.h>
#include <sys/types.h>
#define PCI_VENDOR_ID_IVSHMEM PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM 0x1110
#define IVSHMEM_IOEVENTFD 0
#define IVSHMEM_MSI 1
......@@ -800,8 +803,8 @@ static void ivshmem_class_init(ObjectClass *klass, void *data)
k->init = pci_ivshmem_init;
k->exit = pci_ivshmem_uninit;
k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
k->device_id = 0x1110;
k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
k->device_id = PCI_DEVICE_ID_IVSHMEM;
k->class_id = PCI_CLASS_MEMORY_RAM;
dc->reset = ivshmem_reset;
dc->props = ivshmem_properties;
......
......@@ -1031,6 +1031,19 @@ static bool assigned_dev_msix_masked(MSIXTableEntry *entry)
return (entry->ctrl & cpu_to_le32(0x1)) != 0;
}
/*
* When MSI-X is first enabled the vector table typically has all the
* vectors masked, so we can't use that as the obvious test to figure out
* how many vectors to initially enable. Instead we look at the data field
* because this is what worked for pci-assign for a long time. This makes
* sure the physical MSI-X state tracks the guest's view, which is important
* for some VF/PF and PF/fw communication channels.
*/
static bool assigned_dev_msix_skipped(MSIXTableEntry *entry)
{
return !entry->data;
}
static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
{
AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
......@@ -1041,7 +1054,7 @@ static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
/* Get the usable entry number for allocating */
for (i = 0; i < adev->msix_max; i++, entry++) {
if (assigned_dev_msix_masked(entry)) {
if (assigned_dev_msix_skipped(entry)) {
continue;
}
entries_nr++;
......@@ -1070,7 +1083,7 @@ static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
for (i = 0; i < adev->msix_max; i++, entry++) {
adev->msi_virq[i] = -1;
if (assigned_dev_msix_masked(entry)) {
if (assigned_dev_msix_skipped(entry)) {
continue;
}
......
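To see why the data-field test works where the mask test cannot, consider a
toy table right after MSI-X enable: every entry is masked, but only the
vectors the guest actually programmed have a nonzero data field. A standalone
sketch under that assumption (all names local to the sketch, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint32_t data;  /* MSI data programmed by the guest */
        uint32_t ctrl;  /* bit 0: vector masked */
    } ToyMSIXEntry;

    static int toy_masked(const ToyMSIXEntry *e)  { return e->ctrl & 1; }
    static int toy_skipped(const ToyMSIXEntry *e) { return !e->data; }

    int main(void)
    {
        /* Guest programmed two vectors but left the whole table masked. */
        ToyMSIXEntry table[4] = {
            { .data = 0x4021, .ctrl = 1 },
            { .data = 0x4022, .ctrl = 1 },
            { .data = 0,      .ctrl = 1 },
            { .data = 0,      .ctrl = 1 },
        };
        int by_mask = 0, by_data = 0;

        for (int i = 0; i < 4; i++) {
            by_mask += !toy_masked(&table[i]);   /* old test: finds 0 */
            by_data += !toy_skipped(&table[i]);  /* new test: finds 2 */
        }
        printf("usable vectors: mask test %d, data test %d\n",
               by_mask, by_data);
        return 0;
    }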
......@@ -27,7 +27,7 @@
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
{
uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
MSIMessage msg;
......
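msix_get_message, now exported so virtio-pci can cache and compare messages,
conceptually just reads the address and data fields back out of the 16-byte
MSI-X table entry. A self-contained sketch of that layout (entry offsets per
the PCI spec; all names local to the sketch):

    #include <stdint.h>
    #include <inttypes.h>
    #include <string.h>
    #include <stdio.h>

    typedef struct { uint64_t address; uint32_t data; } ToyMSIMessage;

    static ToyMSIMessage toy_get_message(const uint8_t *table,
                                         unsigned vector)
    {
        const uint8_t *entry = table + vector * 16; /* entry size is 16 */
        uint32_t lo, hi;
        ToyMSIMessage msg;

        memcpy(&lo, entry + 0, 4);        /* lower address */
        memcpy(&hi, entry + 4, 4);        /* upper address */
        memcpy(&msg.data, entry + 8, 4);  /* data */
        msg.address = ((uint64_t)hi << 32) | lo;
        return msg;
    }

    int main(void)
    {
        uint8_t table[2 * 16] = { 0 };
        uint32_t lo = 0xfee01000, data = 0x4021;

        memcpy(table + 16 + 0, &lo, 4);   /* program vector 1 */
        memcpy(table + 16 + 8, &data, 4);

        ToyMSIMessage m = toy_get_message(table, 1);
        printf("addr=0x%" PRIx64 " data=0x%" PRIx32 "\n",
               m.address, m.data);
        return 0;
    }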
......@@ -5,6 +5,7 @@
#include "hw/pci/pci.h"
void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
MSIMessage msix_get_message(PCIDevice *dev, unsigned int vector);
int msix_init(PCIDevice *dev, unsigned short nentries,
MemoryRegion *table_bar, uint8_t table_bar_nr,
unsigned table_offset, MemoryRegion *pba_bar,
......
......@@ -77,6 +77,14 @@
#define PCI_DEVICE_ID_VIRTIO_CONSOLE 0x1003
#define PCI_DEVICE_ID_VIRTIO_SCSI 0x1004
#define PCI_DEVICE_ID_VIRTIO_RNG 0x1005
#define PCI_DEVICE_ID_VIRTIO_9P 0x1009
#define PCI_VENDOR_ID_REDHAT 0x1b36
#define PCI_DEVICE_ID_REDHAT_BRIDGE 0x0001
#define PCI_DEVICE_ID_REDHAT_SERIAL 0x0002
#define PCI_DEVICE_ID_REDHAT_SERIAL2 0x0003
#define PCI_DEVICE_ID_REDHAT_SERIAL4 0x0004
#define PCI_DEVICE_ID_REDHAT_QXL 0x0100
#define FMT_PCIBUS PRIx64
......
......@@ -27,10 +27,6 @@
#include "exec/memory.h"
#include "pci/pci_bus.h"
#define REDHAT_PCI_VENDOR_ID 0x1b36
#define PCI_BRIDGE_DEV_VENDOR_ID REDHAT_PCI_VENDOR_ID
#define PCI_BRIDGE_DEV_DEVICE_ID 0x1
struct PCIBridgeDev {
PCIBridge bridge;
MemoryRegion bar;
......@@ -146,8 +142,8 @@ static void pci_bridge_dev_class_init(ObjectClass *klass, void *data)
k->init = pci_bridge_dev_initfn;
k->exit = pci_bridge_dev_exitfn;
k->config_write = pci_bridge_dev_write_config;
k->vendor_id = PCI_BRIDGE_DEV_VENDOR_ID;
k->device_id = PCI_BRIDGE_DEV_DEVICE_ID;
k->vendor_id = PCI_VENDOR_ID_REDHAT;
k->device_id = PCI_DEVICE_ID_REDHAT_BRIDGE;
k->class_id = PCI_CLASS_BRIDGE_PCI;
k->is_bridge = 1;
dc->desc = "Standard PCI Bridge";
......
......@@ -185,8 +185,8 @@ static void serial_pci_class_initfn(ObjectClass *klass, void *data)
PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass);
pc->init = serial_pci_init;
pc->exit = serial_pci_exit;
pc->vendor_id = 0x1b36; /* Red Hat */
pc->device_id = 0x0002;
pc->vendor_id = PCI_VENDOR_ID_REDHAT;
pc->device_id = PCI_DEVICE_ID_REDHAT_SERIAL;
pc->revision = 1;
pc->class_id = PCI_CLASS_COMMUNICATION_SERIAL;
dc->vmsd = &vmstate_pci_serial;
......@@ -199,8 +199,8 @@ static void multi_2x_serial_pci_class_initfn(ObjectClass *klass, void *data)
PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass);
pc->init = multi_serial_pci_init;
pc->exit = multi_serial_pci_exit;
pc->vendor_id = 0x1b36; /* Red Hat */
pc->device_id = 0x0003;
pc->vendor_id = PCI_VENDOR_ID_REDHAT;
pc->device_id = PCI_DEVICE_ID_REDHAT_SERIAL2;
pc->revision = 1;
pc->class_id = PCI_CLASS_COMMUNICATION_SERIAL;
dc->vmsd = &vmstate_pci_multi_serial;
......@@ -213,8 +213,8 @@ static void multi_4x_serial_pci_class_initfn(ObjectClass *klass, void *data)
PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass);
pc->init = multi_serial_pci_init;
pc->exit = multi_serial_pci_exit;
pc->vendor_id = 0x1b36; /* Red Hat */
pc->device_id = 0x0004;
pc->vendor_id = PCI_VENDOR_ID_REDHAT;
pc->device_id = PCI_DEVICE_ID_REDHAT_SERIAL4;
pc->revision = 1;
pc->class_id = PCI_CLASS_COMMUNICATION_SERIAL;
dc->vmsd = &vmstate_pci_multi_serial;
......
......@@ -612,7 +612,7 @@ static void vhost_log_stop(MemoryListener *listener,
/* FIXME: implement */
}
static int vhost_virtqueue_init(struct vhost_dev *dev,
static int vhost_virtqueue_start(struct vhost_dev *dev,
struct VirtIODevice *vdev,
struct vhost_virtqueue *vq,
unsigned idx)
......@@ -681,16 +681,11 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
goto fail_kick;
}
file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
if (r) {
r = -errno;
goto fail_call;
}
/* Clear and discard previous events if any. */
event_notifier_test_and_clear(&vq->masked_notifier);
return 0;
fail_call:
fail_kick:
fail_alloc:
cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
......@@ -708,7 +703,7 @@ fail_alloc_desc:
return r;
}
static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
static void vhost_virtqueue_stop(struct vhost_dev *dev,
struct VirtIODevice *vdev,
struct vhost_virtqueue *vq,
unsigned idx)
......@@ -746,11 +741,39 @@ static void vhost_eventfd_del(MemoryListener *listener,
{
}
static int vhost_virtqueue_init(struct vhost_dev *dev,
struct vhost_virtqueue *vq, int n)
{
struct vhost_vring_file file = {
.index = n,
};
int r = event_notifier_init(&vq->masked_notifier, 0);
if (r < 0) {
return r;
}
file.fd = event_notifier_get_fd(&vq->masked_notifier);
r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
if (r) {
r = -errno;
goto fail_call;
}
return 0;
fail_call:
event_notifier_cleanup(&vq->masked_notifier);
return r;
}
static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
event_notifier_cleanup(&vq->masked_notifier);
}
int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath,
bool force)
{
uint64_t features;
int r;
int i, r;
if (devfd >= 0) {
hdev->control = devfd;
} else {
......@@ -768,6 +791,13 @@ int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath,
if (r < 0) {
goto fail;
}
for (i = 0; i < hdev->nvqs; ++i) {
r = vhost_virtqueue_init(hdev, hdev->vqs + i, i);
if (r < 0) {
goto fail_vq;
}
}
hdev->features = features;
hdev->memory_listener = (MemoryListener) {
......@@ -795,6 +825,10 @@ int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath,
memory_listener_register(&hdev->memory_listener, &address_space_memory);
hdev->force = force;
return 0;
fail_vq:
while (--i >= 0) {
vhost_virtqueue_cleanup(hdev->vqs + i);
}
fail:
r = -errno;
close(hdev->control);
......@@ -803,6 +837,10 @@ fail:
void vhost_dev_cleanup(struct vhost_dev *hdev)
{
int i;
for (i = 0; i < hdev->nvqs; ++i) {
vhost_virtqueue_cleanup(hdev->vqs + i);
}
memory_listener_unregister(&hdev->memory_listener);
g_free(hdev->mem);
g_free(hdev->mem_sections);
......@@ -869,17 +907,53 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
}
}
/* Test and clear event pending status.
* Should be called after unmask to avoid losing events.
*/
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
struct vhost_virtqueue *vq = hdev->vqs + n;
assert(hdev->started);
return event_notifier_test_and_clear(&vq->masked_notifier);
}
/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
bool mask)
{
struct VirtQueue *vvq = virtio_get_queue(vdev, n);
int r;
assert(hdev->started);
struct vhost_vring_file file = {
.index = n,
};
if (mask) {
file.fd = event_notifier_get_fd(&hdev->vqs[n].masked_notifier);
} else {
file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
}
r = ioctl(hdev->control, VHOST_SET_VRING_CALL, &file);
assert(r >= 0);
}
/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
int i, r;
hdev->started = true;
if (!vdev->binding->set_guest_notifiers) {
fprintf(stderr, "binding does not support guest notifiers\n");
r = -ENOSYS;
goto fail;
}
r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
r = vdev->binding->set_guest_notifiers(vdev->binding_opaque,
hdev->nvqs,
true);
if (r < 0) {
fprintf(stderr, "Error binding guest notifier: %d\n", -r);
goto fail_notifiers;
......@@ -895,7 +969,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
goto fail_mem;
}
for (i = 0; i < hdev->nvqs; ++i) {
r = vhost_virtqueue_init(hdev,
r = vhost_virtqueue_start(hdev,
vdev,
hdev->vqs + i,
i);
......@@ -916,22 +990,22 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
}
}
hdev->started = true;
return 0;
fail_log:
fail_vq:
while (--i >= 0) {
vhost_virtqueue_cleanup(hdev,
vhost_virtqueue_stop(hdev,
vdev,
hdev->vqs + i,
i);
}
fail_mem:
fail_features:
vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
vdev->binding->set_guest_notifiers(vdev->binding_opaque, hdev->nvqs, false);
fail_notifiers:
fail:
hdev->started = false;
return r;
}
......@@ -941,7 +1015,7 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
int i, r;
for (i = 0; i < hdev->nvqs; ++i) {
vhost_virtqueue_cleanup(hdev,
vhost_virtqueue_stop(hdev,
vdev,
hdev->vqs + i,
i);
......@@ -950,7 +1024,9 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
vhost_sync_dirty_bitmap(hdev, &hdev->mem_sections[i],
0, (hwaddr)~0x0ull);
}
r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
r = vdev->binding->set_guest_notifiers(vdev->binding_opaque,
hdev->nvqs,
false);
if (r < 0) {
fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", r);
fflush(stderr);
......
......@@ -18,6 +18,7 @@ struct vhost_virtqueue {
void *ring;
unsigned long long ring_phys;
unsigned ring_size;
EventNotifier masked_notifier;
};
typedef unsigned long vhost_log_chunk_t;
......@@ -53,4 +54,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
/* Test and clear masked event pending status.
* Should be called after unmask to avoid losing events.
*/
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n);
/* Mask/unmask events from this vq.
*/
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
bool mask);
#endif
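The ordering these comments require (unmask first, then test-and-clear) can
be modeled with plain eventfds standing in for QEMU's EventNotifier: an event
that fired while masked is drained from the masked notifier and re-injected
at the guest notifier, so nothing is lost. A Linux-only sketch, no QEMU types:

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    /* Like event_notifier_test_and_clear: true if an event was pending. */
    static int test_and_clear(int efd)
    {
        uint64_t cnt;
        return read(efd, &cnt, sizeof(cnt)) == sizeof(cnt);
    }

    int main(void)
    {
        int masked_notifier = eventfd(0, EFD_NONBLOCK);
        int guest_notifier  = eventfd(0, EFD_NONBLOCK);
        uint64_t one = 1;

        /* While masked, the backend's VRING_CALL fd is the masked
         * notifier, so an interrupt fired now lands there. */
        if (write(masked_notifier, &one, sizeof(one)) != sizeof(one)) {
            return 1;
        }

        /* Unmask: repoint VRING_CALL at the guest notifier (elided),
         * THEN drain the masked notifier and forward anything pending. */
        if (test_and_clear(masked_notifier)) {
            if (write(guest_notifier, &one, sizeof(one)) != sizeof(one)) {
                return 1;
            }
        }

        printf("event delivered: %d\n", test_and_clear(guest_notifier));
        close(masked_notifier);
        close(guest_notifier);
        return 0;
    }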
......@@ -109,6 +109,9 @@ struct vhost_net *vhost_net_init(NetClientState *backend, int devfd,
(1 << VHOST_NET_F_VIRTIO_NET_HDR);
net->backend = r;
net->dev.nvqs = 2;
net->dev.vqs = net->vqs;
r = vhost_dev_init(&net->dev, devfd, "/dev/vhost-net", force);
if (r < 0) {
goto fail;
......@@ -143,9 +146,6 @@ int vhost_net_start(struct vhost_net *net,
struct vhost_vring_file file = { };
int r;
net->dev.nvqs = 2;
net->dev.vqs = net->vqs;
r = vhost_dev_enable_notifiers(&net->dev, dev);
if (r < 0) {
goto fail_notifiers;
......@@ -200,6 +200,17 @@ void vhost_net_cleanup(struct vhost_net *net)
vhost_dev_cleanup(&net->dev);
g_free(net);
}
bool vhost_net_virtqueue_pending(VHostNetState *net, int idx)
{
return vhost_virtqueue_pending(&net->dev, idx);
}
void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
int idx, bool mask)
{
vhost_virtqueue_mask(&net->dev, dev, idx, mask);
}
#else
struct vhost_net *vhost_net_init(NetClientState *backend, int devfd,
bool force)
......@@ -234,4 +245,14 @@ unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
void vhost_net_ack_features(struct vhost_net *net, unsigned features)
{
}
bool vhost_net_virtqueue_pending(VHostNetState *net, int idx)
{
return false;
}
void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
int idx, bool mask)
{
}
#endif
......@@ -17,4 +17,7 @@ void vhost_net_cleanup(VHostNetState *net);
unsigned vhost_net_get_features(VHostNetState *net, unsigned features);
void vhost_net_ack_features(VHostNetState *net, unsigned features);
bool vhost_net_virtqueue_pending(VHostNetState *net, int n);
void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
int idx, bool mask);
#endif
......@@ -126,12 +126,12 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
if (!vhost_net_query(tap_get_vhost_net(n->nic->nc.peer), &n->vdev)) {
return;
}
n->vhost_started = 1;
r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), &n->vdev);
if (r < 0) {
error_report("unable to start vhost net: %d: "
"falling back on userspace virtio", -r);
} else {
n->vhost_started = 1;
n->vhost_started = 0;
}
} else {
vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), &n->vdev);
......@@ -1010,6 +1010,22 @@ static NetClientInfo net_virtio_info = {
.link_status_changed = virtio_net_set_link_status,
};
static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
{
VirtIONet *n = to_virtio_net(vdev);
assert(n->vhost_started);
return vhost_net_virtqueue_pending(tap_get_vhost_net(n->nic->nc.peer), idx);
}
static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
bool mask)
{
VirtIONet *n = to_virtio_net(vdev);
assert(n->vhost_started);
vhost_net_virtqueue_mask(tap_get_vhost_net(n->nic->nc.peer),
vdev, idx, mask);
}
VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
virtio_net_conf *net)
{
......@@ -1026,6 +1042,8 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
n->vdev.bad_features = virtio_net_bad_features;
n->vdev.reset = virtio_net_reset;
n->vdev.set_status = virtio_net_set_status;
n->vdev.guest_notifier_mask = virtio_net_guest_notifier_mask;
n->vdev.guest_notifier_pending = virtio_net_guest_notifier_pending;
n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) {
......
......@@ -487,8 +487,6 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
unsigned int vector,
MSIMessage msg)
{
VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
EventNotifier *n = virtio_queue_get_guest_notifier(vq);
VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
int ret;
......@@ -500,20 +498,33 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
irqfd->virq = ret;
}
irqfd->users++;
ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, irqfd->virq);
if (ret < 0) {
if (--irqfd->users == 0) {
kvm_irqchip_release_virq(kvm_state, irqfd->virq);
}
return ret;
}
return 0;
}
static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector)
{
VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
if (--irqfd->users == 0) {
kvm_irqchip_release_virq(kvm_state, irqfd->virq);
}
}
static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector)
{
VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
EventNotifier *n = virtio_queue_get_guest_notifier(vq);
int ret;
ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, irqfd->virq);
return ret;
}
static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector)
{
VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
EventNotifier *n = virtio_queue_get_guest_notifier(vq);
......@@ -522,27 +533,143 @@ static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy,
ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq);
assert(ret == 0);
}
if (--irqfd->users == 0) {
kvm_irqchip_release_virq(kvm_state, irqfd->virq);
static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
PCIDevice *dev = &proxy->pci_dev;
VirtIODevice *vdev = proxy->vdev;
unsigned int vector;
int ret, queue_no;
MSIMessage msg;
for (queue_no = 0; queue_no < nvqs; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
vector = virtio_queue_vector(vdev, queue_no);
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
msg = msix_get_message(dev, vector);
ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
if (ret < 0) {
goto undo;
}
/* If guest supports masking, set up irqfd now.
* Otherwise, delay until unmasked in the frontend.
*/
if (proxy->vdev->guest_notifier_mask) {
ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
if (ret < 0) {
kvm_virtio_pci_vq_vector_release(proxy, vector);
goto undo;
}
}
}
return 0;
undo:
while (--queue_no >= 0) {
vector = virtio_queue_vector(vdev, queue_no);
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
if (proxy->vdev->guest_notifier_mask) {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
kvm_virtio_pci_vq_vector_release(proxy, vector);
}
return ret;
}
static void kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs)
{
PCIDevice *dev = &proxy->pci_dev;
VirtIODevice *vdev = proxy->vdev;
unsigned int vector;
int queue_no;
for (queue_no = 0; queue_no < nvqs; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
vector = virtio_queue_vector(vdev, queue_no);
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
/* If guest supports masking, clean up irqfd now.
* Otherwise, it was cleaned when masked in the frontend.
*/
if (proxy->vdev->guest_notifier_mask) {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
kvm_virtio_pci_vq_vector_release(proxy, vector);
}
}
static int kvm_virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector,
MSIMessage msg)
{
VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
EventNotifier *n = virtio_queue_get_guest_notifier(vq);
VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
int ret = 0;
if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
if (ret < 0) {
return ret;
}
}
/* If guest supports masking, irqfd is already setup, unmask it.
* Otherwise, set it up now.
*/
if (proxy->vdev->guest_notifier_mask) {
proxy->vdev->guest_notifier_mask(proxy->vdev, queue_no, false);
/* Test after unmasking to avoid losing events. */
if (proxy->vdev->guest_notifier_pending &&
proxy->vdev->guest_notifier_pending(proxy->vdev, queue_no)) {
event_notifier_set(n);
}
} else {
ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
}
return ret;
}
static int kvm_virtio_pci_vector_use(PCIDevice *dev, unsigned vector,
static void kvm_virtio_pci_vq_vector_mask(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector)
{
/* If guest supports masking, keep irqfd but mask it.
* Otherwise, clean it up now.
*/
if (proxy->vdev->guest_notifier_mask) {
proxy->vdev->guest_notifier_mask(proxy->vdev, queue_no, true);
} else {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
}
static int kvm_virtio_pci_vector_unmask(PCIDevice *dev, unsigned vector,
MSIMessage msg)
{
VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
VirtIODevice *vdev = proxy->vdev;
int ret, queue_no;
for (queue_no = 0; queue_no < VIRTIO_PCI_QUEUE_MAX; queue_no++) {
for (queue_no = 0; queue_no < proxy->nvqs_with_notifiers; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
if (virtio_queue_vector(vdev, queue_no) != vector) {
continue;
}
ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
ret = kvm_virtio_pci_vq_vector_unmask(proxy, queue_no, vector, msg);
if (ret < 0) {
goto undo;
}
......@@ -554,25 +681,25 @@ undo:
if (virtio_queue_vector(vdev, queue_no) != vector) {
continue;
}
kvm_virtio_pci_vq_vector_release(proxy, queue_no, vector);
kvm_virtio_pci_vq_vector_mask(proxy, queue_no, vector);
}
return ret;
}
static void kvm_virtio_pci_vector_release(PCIDevice *dev, unsigned vector)
static void kvm_virtio_pci_vector_mask(PCIDevice *dev, unsigned vector)
{
VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
VirtIODevice *vdev = proxy->vdev;
int queue_no;
for (queue_no = 0; queue_no < VIRTIO_PCI_QUEUE_MAX; queue_no++) {
for (queue_no = 0; queue_no < proxy->nvqs_with_notifiers; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
if (virtio_queue_vector(vdev, queue_no) != vector) {
continue;
}
kvm_virtio_pci_vq_vector_release(proxy, queue_no, vector);
kvm_virtio_pci_vq_vector_mask(proxy, queue_no, vector);
}
}
......@@ -587,7 +714,7 @@ static void kvm_virtio_pci_vector_poll(PCIDevice *dev,
EventNotifier *notifier;
VirtQueue *vq;
for (queue_no = 0; queue_no < VIRTIO_PCI_QUEUE_MAX; queue_no++) {
for (queue_no = 0; queue_no < proxy->nvqs_with_notifiers; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
......@@ -598,7 +725,11 @@ static void kvm_virtio_pci_vector_poll(PCIDevice *dev,
}
vq = virtio_get_queue(vdev, queue_no);
notifier = virtio_queue_get_guest_notifier(vq);
if (event_notifier_test_and_clear(notifier)) {
if (vdev->guest_notifier_pending) {
if (vdev->guest_notifier_pending(vdev, queue_no)) {
msix_set_pending(dev, vector);
}
} else if (event_notifier_test_and_clear(notifier)) {
msix_set_pending(dev, vector);
}
}
......@@ -631,7 +762,7 @@ static bool virtio_pci_query_guest_notifiers(DeviceState *d)
return msix_enabled(&proxy->pci_dev);
}
static int virtio_pci_set_guest_notifiers(DeviceState *d, bool assign)
static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign)
{
VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
VirtIODevice *vdev = proxy->vdev;
......@@ -639,14 +770,24 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, bool assign)
bool with_irqfd = msix_enabled(&proxy->pci_dev) &&
kvm_msi_via_irqfd_enabled();
nvqs = MIN(nvqs, VIRTIO_PCI_QUEUE_MAX);
/* When deassigning, pass a consistent nvqs value
* to avoid leaking notifiers.
*/
assert(assign || nvqs == proxy->nvqs_with_notifiers);
proxy->nvqs_with_notifiers = nvqs;
/* Must unset vector notifier while guest notifier is still assigned */
if (proxy->vector_irqfd && !assign) {
msix_unset_vector_notifiers(&proxy->pci_dev);
kvm_virtio_pci_vector_release(proxy, nvqs);
g_free(proxy->vector_irqfd);
proxy->vector_irqfd = NULL;
}
for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
for (n = 0; n < nvqs; n++) {
if (!virtio_queue_get_num(vdev, n)) {
break;
}
......@@ -663,17 +804,25 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, bool assign)
proxy->vector_irqfd =
g_malloc0(sizeof(*proxy->vector_irqfd) *
msix_nr_vectors_allocated(&proxy->pci_dev));
r = kvm_virtio_pci_vector_use(proxy, nvqs);
if (r < 0) {
goto assign_error;
}
r = msix_set_vector_notifiers(&proxy->pci_dev,
kvm_virtio_pci_vector_use,
kvm_virtio_pci_vector_release,
kvm_virtio_pci_vector_unmask,
kvm_virtio_pci_vector_mask,
kvm_virtio_pci_vector_poll);
if (r < 0) {
goto assign_error;
goto notifiers_error;
}
}
return 0;
notifiers_error:
assert(assign);
kvm_virtio_pci_vector_release(proxy, nvqs);
assign_error:
/* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */
assert(assign);
......
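One detail worth calling out from the hunks above: queues that share an MSI
vector share one KVM route, via the users refcount in
kvm_virtio_pci_vq_vector_use/release. A toy model of that lifecycle (names
local to the sketch; next_virq stands in for kvm_irqchip_add_msi_route):

    #include <stdio.h>

    typedef struct { int virq; unsigned users; } ToyIRQFD;

    static int next_virq = 100;

    static int vector_use(ToyIRQFD *f)
    {
        if (f->users == 0) {
            f->virq = next_virq++;  /* allocate route on first user */
        }
        f->users++;
        return f->virq;
    }

    static void vector_release(ToyIRQFD *f)
    {
        if (--f->users == 0) {
            /* last user: kvm_irqchip_release_virq in the real code */
            printf("virq %d released\n", f->virq);
        }
    }

    int main(void)
    {
        ToyIRQFD fd = { 0, 0 };

        vector_use(&fd);      /* queue 0 -> vector */
        vector_use(&fd);      /* queue 1 -> same vector, same virq */
        printf("virq %d shared by %u users\n", fd.virq, fd.users);
        vector_release(&fd);
        vector_release(&fd);  /* frees the route */
        return 0;
    }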
......@@ -27,6 +27,7 @@
#define VIRTIO_PCI_FLAG_USE_IOEVENTFD (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
typedef struct {
MSIMessage msg;
int virq;
unsigned int users;
} VirtIOIRQFD;
......@@ -51,6 +52,7 @@ typedef struct {
bool ioeventfd_disabled;
bool ioeventfd_started;
VirtIOIRQFD *vector_irqfd;
int nvqs_with_notifiers;
} VirtIOPCIProxy;
void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev);
......
......@@ -99,7 +99,7 @@ typedef struct {
int (*load_done)(DeviceState *d, QEMUFile *f);
unsigned (*get_features)(DeviceState *d);
bool (*query_guest_notifiers)(DeviceState *d);
int (*set_guest_notifiers)(DeviceState *d, bool assigned);
int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assigned);
int (*set_host_notifier)(DeviceState *d, int n, bool assigned);
void (*vmstate_change)(DeviceState *d, bool running);
} VirtIOBindings;
......@@ -126,6 +126,19 @@ struct VirtIODevice
void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
void (*reset)(VirtIODevice *vdev);
void (*set_status)(VirtIODevice *vdev, uint8_t val);
/* Test and clear event pending status.
* Should be called after unmask to avoid losing events.
* If backend does not support masking,
* must check in frontend instead.
*/
bool (*guest_notifier_pending)(VirtIODevice *vdev, int n);
/* Mask/unmask events from this vq. Any events reported
* while masked will become pending.
* If backend does not support masking,
* must mask in frontend instead.
*/
void (*guest_notifier_mask)(VirtIODevice *vdev, int n, bool mask);
VirtQueue *vq;
const VirtIOBindings *binding;
DeviceState *binding_opaque;
......
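A minimal model of the contract these callbacks document, with plain bools in
place of notifiers (purely illustrative, no QEMU types): events reported
while masked become pending rather than being delivered, and
guest_notifier_pending is a test-and-clear queried right after unmasking.

    #include <stdbool.h>
    #include <stdio.h>

    #define NVQS 2

    static bool masked[NVQS], pending[NVQS];

    /* Backend reports an event for vq n. */
    static void backend_event(int n)
    {
        if (masked[n]) {
            pending[n] = true;              /* recorded, not lost */
        } else {
            printf("vq %d: interrupt guest\n", n);
        }
    }

    static void guest_notifier_mask(int n, bool mask)
    {
        masked[n] = mask;
    }

    /* Test and clear, as the comment above specifies. */
    static bool guest_notifier_pending(int n)
    {
        bool p = pending[n];
        pending[n] = false;
        return p;
    }

    int main(void)
    {
        guest_notifier_mask(0, true);
        backend_event(0);                 /* lands in pending[] */
        guest_notifier_mask(0, false);    /* unmask first... */
        if (guest_notifier_pending(0)) {  /* ...then test-and-clear */
            printf("vq 0: deliver deferred interrupt\n");
        }
        return 0;
    }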
......@@ -131,6 +131,11 @@ void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
return -ENOSYS;
}
int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
{
return -ENOSYS;
......
PCI IDs for qemu
================
Red Hat, Inc. donates a part of its device ID range to qemu, to be used for
virtual devices. The vendor ID is 1af4 (formerly Qumranet ID).
The 1000 -> 10ff device ID range is used for VirtIO devices.
The 1100 device ID is used as PCI Subsystem ID for existing hardware
devices emulated by qemu.
All other device IDs are reserved.
VirtIO Device IDs
-----------------
1af4:1000 network device
1af4:1001 block device
1af4:1002 balloon device
1af4:1003 console device
1af4:1004  Reserved.
   to      Contact Gerd Hoffmann <kraxel@redhat.com> to get a
1af4:10ef  device ID assigned for your new virtio device.

1af4:10f0  Available for experimental usage without registration.  Must get
   to      official ID when the code leaves the test lab (i.e. when seeking
1af4:10ff  upstream merge or shipping a distro/product) to avoid conflicts.