Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

virtio, pci: fixes, features virtio is using region caches for performance iommu support for IOTLBs misc fixes Signed-off-by: N Michael S. Tsirkin <mst@redhat.com> # gpg: Signature made Fri 17 Feb 2017 19:53:02 GMT # gpg: using RSA key 0x281F0DB8D28D5469 # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" # gpg: aka "Michael S. Tsirkin <mst@redhat.com>" # Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * remotes/mst/tags/for_upstream: (23 commits) intel_iommu: vtd_slpt_level_shift check level intel_iommu: convert dbg macros to trace for trans intel_iommu: convert dbg macros to traces for inv intel_iommu: renaming gpa to iova where proper intel_iommu: simplify irq region translation intel_iommu: add "caching-mode" option vfio: allow to notify unmap for very large region vfio: introduce vfio_get_vaddr() vfio: trace map/unmap for notify as well pcie: simplify pcie_add_capability() virtio: Fix no interrupt when not creating msi controller virtio: use VRingMemoryRegionCaches for avail and used rings virtio: check for vring setup in virtio_queue_update_used_idx virtio: use VRingMemoryRegionCaches for descriptor ring virtio: add MemoryListener to cache ring translations virtio: use MemoryRegionCache to access descriptors exec: make address_space_cache_destroy idempotent virtio: use address_space_map/unmap to access descriptors virtio: add virtio_*_phys_cached memory: make memory_listener_unregister idempotent ... Signed-off-by: N Peter Maydell <peter.maydell@linaro.org>

Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
virtio, pci: fixes, features virtio is using region caches for performance iommu support for IOTLBs misc fixes Signed-off-by: N Michael S. Tsirkin <mst@redhat.com> # gpg: Signature made Fri 17 Feb 2017 19:53:02 GMT # gpg: using RSA key 0x281F0DB8D28D5469 # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" # gpg: aka "Michael S. Tsirkin <mst@redhat.com>" # Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * remotes/mst/tags/for_upstream: (23 commits) intel_iommu: vtd_slpt_level_shift check level intel_iommu: convert dbg macros to trace for trans intel_iommu: convert dbg macros to traces for inv intel_iommu: renaming gpa to iova where proper intel_iommu: simplify irq region translation intel_iommu: add "caching-mode" option vfio: allow to notify unmap for very large region vfio: introduce vfio_get_vaddr() vfio: trace map/unmap for notify as well pcie: simplify pcie_add_capability() virtio: Fix no interrupt when not creating msi controller virtio: use VRingMemoryRegionCaches for avail and used rings virtio: check for vring setup in virtio_queue_update_used_idx virtio: use VRingMemoryRegionCaches for descriptor ring virtio: add MemoryListener to cache ring translations virtio: use MemoryRegionCache to access descriptors exec: make address_space_cache_destroy idempotent virtio: use address_space_map/unmap to access descriptors virtio: add virtio_*_phys_cached memory: make memory_listener_unregister idempotent ... Signed-off-by: N Peter Maydell <peter.maydell@linaro.org>
d514cfd7 · Peter Maydell · ad584d37 · 7e58326a · d514cfd7 · d514cfd7
21 changed file
--- a/docs/nvdimm.txt
+++ b/docs/nvdimm.txt
+QEMU Virtual NVDIMM
+===================
+
+This document explains the usage of virtual NVDIMM (vNVDIMM) feature
+which is available since QEMU v2.6.0.
+
+The current QEMU only implements the persistent memory mode of vNVDIMM
+device and not the block window mode.
+
+Basic Usage
+-----------
+
+The storage of a vNVDIMM device in QEMU is provided by the memory
+backend (i.e. memory-backend-file and memory-backend-ram). A simple
+way to create a vNVDIMM device at startup time is done via the
+following command line options:
+
+ -machine pc,nvdimm
+ -m $RAM_SIZE,slots=$N,maxmem=$MAX_SIZE
+ -object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE
+ -device nvdimm,id=nvdimm1,memdev=mem1
+
+Where,
+
+ - the "nvdimm" machine option enables vNVDIMM feature.
+
+ - "slots=$N" should be equal to or larger than the total amount of
+   normal RAM devices and vNVDIMM devices, e.g. $N should be >= 2 here.
+
+ - "maxmem=$MAX_SIZE" should be equal to or larger than the total size
+   of normal RAM devices and vNVDIMM devices, e.g. $MAX_SIZE should be
+   >= $RAM_SIZE + $NVDIMM_SIZE here.
+
+ - "object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE"
+   creates a backend storage of size $NVDIMM_SIZE on a file $PATH. All
+   accesses to the virtual NVDIMM device go to the file $PATH.
+
+   "share=on/off" controls the visibility of guest writes. If
+   "share=on", then guest writes will be applied to the backend
+   file. If another guest uses the same backend file with option
+   "share=on", then above writes will be visible to it as well. If
+   "share=off", then guest writes won't be applied to the backend
+   file and thus will be invisible to other guests.
+
+ - "device nvdimm,id=nvdimm1,memdev=mem1" creates a virtual NVDIMM
+   device whose storage is provided by above memory backend device.
+
+Multiple vNVDIMM devices can be created if multiple pairs of "-object"
+and "-device" are provided.
+
+For above command line options, if the guest OS has the proper NVDIMM
+driver, it should be able to detect a NVDIMM device which is in the
+persistent memory mode and whose size is $NVDIMM_SIZE.
+
+Note:
+
+1. Prior to QEMU v2.8.0, if memory-backend-file is used and the actual
+   backend file size is not equal to the size given by "size" option,
+   QEMU will truncate the backend file by ftruncate(2), which will
+   corrupt the existing data in the backend file, especially for the
+   shrink case.
+
+   QEMU v2.8.0 and later check the backend file size and the "size"
+   option. If they do not match, QEMU will report errors and abort in
+   order to avoid the data corruption.
+
+2. QEMU v2.6.0 only puts a basic alignment requirement on the "size"
+   option of memory-backend-file, e.g. 4KB alignment on x86.  However,
+   QEMU v.2.7.0 puts an additional alignment requirement, which may
+   require a larger value than the basic one, e.g. 2MB on x86. This
+   change breaks the usage of memory-backend-file that only satisfies
+   the basic alignment.
+
+   QEMU v2.8.0 and later remove the additional alignment on non-s390x
+   architectures, so the broken memory-backend-file can work again.
+
+Label
+-----
+
+QEMU v2.7.0 and later implement the label support for vNVDIMM devices.
+To enable label on vNVDIMM devices, users can simply add
+"label-size=$SZ" option to "-device nvdimm", e.g.
+
+ -device nvdimm,id=nvdimm1,memdev=mem1,label-size=128K
+
+Note:
+
+1. The minimal label size is 128KB.
+
+2. QEMU v2.7.0 and later store labels at the end of backend storage.
+   If a memory backend file, which was previously used as the backend
+   of a vNVDIMM device without labels, is now used for a vNVDIMM
+   device with label, the data in the label area at the end of file
+   will be inaccessible to the guest. If any useful data (e.g. the
+   meta-data of the file system) was stored there, the latter usage
+   may result guest data corruption (e.g. breakage of guest file
+   system).
+
+Hotplug
+-------
+
+QEMU v2.8.0 and later implement the hotplug support for vNVDIMM
+devices. Similarly to the RAM hotplug, the vNVDIMM hotplug is
+accomplished by two monitor commands "object_add" and "device_add".
+
+For example, the following commands add another 4GB vNVDIMM device to
+the guest:
+
+ (qemu) object_add memory-backend-file,id=mem2,share=on,mem-path=new_nvdimm.img,size=4G
+ (qemu) device_add nvdimm,id=nvdimm2,memdev=mem2
+
+Note:
+
+1. Each hotplugged vNVDIMM device consumes one memory slot. Users
+   should always ensure the memory option "-m ...,slots=N" specifies
+   enough number of slots, i.e.
+     N >= number of RAM devices +
+          number of statically plugged vNVDIMM devices +
+          number of hotplugged vNVDIMM devices
+
+2. The similar is required for the memory option "-m ...,maxmem=M", i.e.
+     M >= size of RAM devices +
+          size of statically plugged vNVDIMM devices +
+          size of hotplugged vNVDIMM devices
--- a/exec.c
+++ b/exec.c
@@ -3166,6 +3166,7 @@ void address_space_cache_destroy(MemoryRegionCache *cache)
        xen_invalidate_map_cache_entry(cache->ptr);
    }
    memory_region_unref(cache->mr);
+    cache->mr = NULL;
 }

 /* Called from RCU critical section.  This function has the same

--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -147,7 +147,7 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
    g_free(s);
 }

-static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
                                                VirtQueue *vq)
 {
    VirtIOBlock *s = (VirtIOBlock *)vdev;
@@ -155,7 +155,7 @@ static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
    assert(s->dataplane);
    assert(s->dataplane_started);

-    virtio_blk_handle_vq(s, vq);
+    return virtio_blk_handle_vq(s, vq);
 }

 /* Context: QEMU global mutex held */

--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -581,10 +581,11 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
    return 0;
 }

-void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
+bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 {
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {};
+    bool progress = false;

    blk_io_plug(s->blk);

@@ -592,6 +593,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
        virtio_queue_set_notification(vq, 0);

        while ((req = virtio_blk_get_request(s, vq))) {
+            progress = true;
            if (virtio_blk_handle_request(req, &mrb)) {
                virtqueue_detach_element(req->vq, &req->elem, 0);
                virtio_blk_free_request(req);
@@ -607,6 +609,12 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
    }

    blk_io_unplug(s->blk);
+    return progress;
+}
+
+static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq)
+{
+    virtio_blk_handle_vq(s, vq);
 }

 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -622,7 +630,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
            return;
        }
    }
-    virtio_blk_handle_vq(s, vq);
+    virtio_blk_handle_output_do(s, vq);
 }

 static void virtio_blk_dma_restart_bh(void *opaque)

--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -202,6 +202,7 @@
 #define VTD_CAP_MAMV                (VTD_MAMV << 48)
 #define VTD_CAP_PSI                 (1ULL << 39)
 #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM                  (1ULL << 7)

 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT         8

--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -3,6 +3,34 @@
 # hw/i386/x86-iommu.c
 x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32

+# hw/i386/intel_iommu.c
+vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
+vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64
+vtd_inv_desc_invalid(uint64_t hi, uint64_t lo) "invalid inv desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16
+vtd_inv_desc_cc_global(void) "context invalidate globally"
+vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8
+vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16
+vtd_inv_desc_cc_invalid(uint64_t hi, uint64_t lo) "invalid context-cache desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_inv_desc_iotlb_global(void) "iotlb invalidate global"
+vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16
+vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8
+vtd_inv_desc_iotlb_invalid(uint64_t hi, uint64_t lo) "invalid iotlb desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32
+vtd_inv_desc_wait_irq(const char *msg) "%s"
+vtd_inv_desc_wait_invalid(uint64_t hi, uint64_t lo) "invalid wait desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
+vtd_re_invalid(uint64_t hi, uint64_t lo) "invalid root entry hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
+vtd_ce_invalid(uint64_t hi, uint64_t lo) "invalid context entry hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
+vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
+vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
+vtd_fault_disabled(void) "Fault processing disabled for context entry"
+
 # hw/i386/amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
 amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64

--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1130,7 +1130,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
    return 0;
 }

-static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
+                                      size_t size)
 {
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIONetQueue *q = virtio_net_get_subqueue(nc);
@@ -1233,6 +1234,17 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
    return size;
 }

+static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
+                                  size_t size)
+{
+    ssize_t r;
+
+    rcu_read_lock();
+    r = virtio_net_receive_rcu(nc, buf, size);
+    rcu_read_unlock();
+    return r;
+}
+
 static int32_t virtio_net_flush_tx(VirtIONetQueue *q);

 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)

--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -610,7 +610,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev)
 * uint16_t ext_cap_size
 */

-static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id,
+/* Passing a cap_id value > 0xffff will return 0 and put end of list in prev */
+static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
                                          uint16_t *prev_p)
 {
    uint16_t prev = 0;
@@ -664,30 +665,24 @@ void pcie_add_capability(PCIDevice *dev,
                         uint16_t cap_id, uint8_t cap_ver,
                         uint16_t offset, uint16_t size)
 {
-    uint32_t header;
-    uint16_t next;
-
    assert(offset >= PCI_CONFIG_SPACE_SIZE);
    assert(offset < offset + size);
    assert(offset + size <= PCIE_CONFIG_SPACE_SIZE);
    assert(size >= 8);
    assert(pci_is_express(dev));

-    if (offset == PCI_CONFIG_SPACE_SIZE) {
-        header = pci_get_long(dev->config + offset);
-        next = PCI_EXT_CAP_NEXT(header);
-    } else {
+    if (offset != PCI_CONFIG_SPACE_SIZE) {
        uint16_t prev;

-        /* 0 is reserved cap id. use internally to find the last capability
-           in the linked list */
-        next = pcie_find_capability_list(dev, 0, &prev);
-
+        /*
+         * 0xffffffff is not a valid cap id (it's a 16 bit field). use
+         * internally to find the last capability in the linked list.
+         */
+        pcie_find_capability_list(dev, 0xffffffff, &prev);
        assert(prev >= PCI_CONFIG_SPACE_SIZE);
-        assert(next == 0);
        pcie_ext_cap_set_next(dev, prev, offset);
    }
-    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
+    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, 0));

    /* Make capability read-only by default */
    memset(dev->wmask + offset, 0, size);

--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -49,35 +49,35 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
    }
 }

-static void virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev,
+static bool virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev,
                                              VirtQueue *vq)
 {
    VirtIOSCSI *s = (VirtIOSCSI *)vdev;

    assert(s->ctx && s->dataplane_started);
-    virtio_scsi_handle_cmd_vq(s, vq);
+    return virtio_scsi_handle_cmd_vq(s, vq);
 }

-static void virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev,
+static bool virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev,
                                               VirtQueue *vq)
 {
    VirtIOSCSI *s = VIRTIO_SCSI(vdev);

    assert(s->ctx && s->dataplane_started);
-    virtio_scsi_handle_ctrl_vq(s, vq);
+    return virtio_scsi_handle_ctrl_vq(s, vq);
 }

-static void virtio_scsi_data_plane_handle_event(VirtIODevice *vdev,
+static bool virtio_scsi_data_plane_handle_event(VirtIODevice *vdev,
                                                VirtQueue *vq)
 {
    VirtIOSCSI *s = VIRTIO_SCSI(vdev);

    assert(s->ctx && s->dataplane_started);
-    virtio_scsi_handle_event_vq(s, vq);
+    return virtio_scsi_handle_event_vq(s, vq);
 }

 static int virtio_scsi_vring_init(VirtIOSCSI *s, VirtQueue *vq, int n,
-                                  void (*fn)(VirtIODevice *vdev, VirtQueue *vq))
+                                  VirtIOHandleAIOOutput fn)
 {
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
    int rc;

--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -436,13 +436,16 @@ static inline void virtio_scsi_release(VirtIOSCSI *s)
    }
 }

-void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
+bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
    VirtIOSCSIReq *req;
+    bool progress = false;

    while ((req = virtio_scsi_pop_req(s, vq))) {
+        progress = true;
        virtio_scsi_handle_ctrl_req(s, req);
    }
+    return progress;
 }

 static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
@@ -591,10 +594,11 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
    scsi_req_unref(sreq);
 }

-void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
+bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
    VirtIOSCSIReq *req, *next;
    int ret = 0;
+    bool progress = false;

    QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);

@@ -602,6 +606,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
        virtio_queue_set_notification(vq, 0);

        while ((req = virtio_scsi_pop_req(s, vq))) {
+            progress = true;
            ret = virtio_scsi_handle_cmd_req_prepare(s, req);
            if (!ret) {
                QTAILQ_INSERT_TAIL(&reqs, req, next);
@@ -624,6 +629,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
    QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
        virtio_scsi_handle_cmd_req_submit(s, req);
    }
+    return progress;
 }

 static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq)
@@ -752,11 +758,13 @@ out:
    virtio_scsi_release(s);
 }

-void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
+bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
    if (s->events_dropped) {
        virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+        return true;
    }
+    return false;
 }

 static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq)

--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -294,53 +294,78 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
           section->offset_within_address_space & (1ULL << 63);
 }

-static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+/* Called with rcu_read_lock held.  */
+static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
+                           bool *read_only)
 {
-    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
-    VFIOContainer *container = giommu->container;
-    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
-    void *vaddr;
-    int ret;
-
-    trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask);
-
-    if (iotlb->target_as != &address_space_memory) {
-        error_report("Wrong target AS \"%s\", only system memory is allowed",
-                     iotlb->target_as->name ? iotlb->target_as->name : "none");
-        return;
-    }
+    bool writable = iotlb->perm & IOMMU_WO;

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target.  We need to translate
     * it the rest of the way through to memory.
     */
-    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
-                                 &xlat, &len, iotlb->perm & IOMMU_WO);
+                                 &xlat, &len, writable);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
-        goto out;
+        return false;
    }
+
    /*
     * Translation truncates length to the IOMMU page size,
     * check that it did not truncate too much.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
-        goto out;
+        return false;
    }

+    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
+    *read_only = !writable || mr->readonly;
+
+    return true;
+}
+
+static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+    VFIOContainer *container = giommu->container;
+    hwaddr iova = iotlb->iova + giommu->iommu_offset;
+    bool read_only;
+    void *vaddr;
+    int ret;
+
+    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
+                                iova, iova + iotlb->addr_mask);
+
+    if (iotlb->target_as != &address_space_memory) {
+        error_report("Wrong target AS \"%s\", only system memory is allowed",
+                     iotlb->target_as->name ? iotlb->target_as->name : "none");
+        return;
+    }
+
+    rcu_read_lock();
+
    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-        vaddr = memory_region_get_ram_ptr(mr) + xlat;
+        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+            goto out;
+        }
+        /*
+         * vaddr is only valid until rcu_read_unlock(). But after
+         * vfio_dma_map has set up the mapping the pages will be
+         * pinned by the kernel. This makes sure that the RAM backend
+         * of vaddr will always be there, even if the memory object is
+         * destroyed and its backing memory munmap-ed.
+         */
        ret = vfio_dma_map(container, iova,
                           iotlb->addr_mask + 1, vaddr,
-                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
+                           read_only);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",

--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -84,7 +84,7 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
 # hw/vfio/common.c
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
-vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64
+vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ %"PRIx64" - %"PRIx64
 vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add %"PRIx64" - %"PRIx64
 vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] %"PRIx64" - %"PRIx64
 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] %"PRIx64" - %"PRIx64" [%p]"

--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1426,6 +1426,8 @@ struct MemoryRegionCache {
    bool is_write;
 };

+#define MEMORY_REGION_CACHE_INVALID ((MemoryRegionCache) { .mr = NULL })
+
 /* address_space_cache_init: prepare for repeated access to a physical
 * memory region
 *

--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -257,6 +257,8 @@ struct IntelIOMMUState {
    uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
    uint32_t version;

+    bool caching_mode;          /* RO - is cap CM enabled? */
+
    dma_addr_t root;                /* Current root table pointer */
    bool root_extended;             /* Type of root table (extended or not) */
    bool dmar_enabled;              /* Set if DMA remapping is enabled */

--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -156,6 +156,58 @@ static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
 #endif
 }

+static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
+                                               MemoryRegionCache *cache,
+                                               hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return lduw_be_phys_cached(cache, pa);
+    }
+    return lduw_le_phys_cached(cache, pa);
+}
+
+static inline uint32_t virtio_ldl_phys_cached(VirtIODevice *vdev,
+                                              MemoryRegionCache *cache,
+                                              hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return ldl_be_phys_cached(cache, pa);
+    }
+    return ldl_le_phys_cached(cache, pa);
+}
+
+static inline uint64_t virtio_ldq_phys_cached(VirtIODevice *vdev,
+                                              MemoryRegionCache *cache,
+                                              hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return ldq_be_phys_cached(cache, pa);
+    }
+    return ldq_le_phys_cached(cache, pa);
+}
+
+static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
+                                          MemoryRegionCache *cache,
+                                          hwaddr pa, uint16_t value)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        stw_be_phys_cached(cache, pa, value);
+    } else {
+        stw_le_phys_cached(cache, pa, value);
+    }
+}
+
+static inline void virtio_stl_phys_cached(VirtIODevice *vdev,
+                                          MemoryRegionCache *cache,
+                                          hwaddr pa, uint32_t value)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        stl_be_phys_cached(cache, pa, value);
+    } else {
+        stl_le_phys_cached(cache, pa, value);
+    }
+}
+
 static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
 {
    *s = virtio_tswap16(vdev, *s);

--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -80,6 +80,6 @@ typedef struct MultiReqBuffer {
    bool is_write;
 } MultiReqBuffer;

-void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
+bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);

 #endif
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -126,9 +126,9 @@ void virtio_scsi_common_realize(DeviceState *dev, Error **errp,
                                VirtIOHandleOutput cmd);

 void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp);
-void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq);
-void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq);
-void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq);
 void virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq, VirtIOSCSIReq *req);
 void virtio_scsi_free_req(VirtIOSCSIReq *req);
 void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,

--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -85,6 +85,7 @@ struct VirtIODevice
    uint32_t generation;
    int nvectors;
    VirtQueue *vq;
+    MemoryListener listener;
    uint16_t device_id;
    bool vm_running;
    bool broken; /* device in invalid state, needs reset */
@@ -154,6 +155,7 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name);

 typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
+typedef bool (*VirtIOHandleAIOOutput)(VirtIODevice *, VirtQueue *);

 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                            VirtIOHandleOutput handle_output);
@@ -284,8 +286,7 @@ bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
 void virtio_queue_host_notifier_read(EventNotifier *n);
 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
-                                                void (*fn)(VirtIODevice *,
-                                                           VirtQueue *));
+                                                VirtIOHandleAIOOutput handle_output);
 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
 VirtQueue *virtio_vector_next_queue(VirtQueue *vq);


--- a/memory.c
+++ b/memory.c
@@ -2371,8 +2371,13 @@ void memory_listener_register(MemoryListener *listener, AddressSpace *as)

 void memory_listener_unregister(MemoryListener *listener)
 {
+    if (!listener->address_space) {
+        return;
+    }
+
    QTAILQ_REMOVE(&memory_listeners, listener, link);
    QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as);
+    listener->address_space = NULL;
 }

 void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)