Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging

Block patches for 2.1.0-rc2 (v2) # gpg: Signature made Mon 14 Jul 2014 11:04:12 BST using RSA key ID C88F2FD6 # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" * remotes/kevin/tags/for-upstream: (22 commits) ide: Treat read/write beyond end as invalid virtio-blk: Treat read/write beyond end as invalid virtio-blk: Bypass error action and I/O accounting on invalid r/w virtio-blk: Factor common checks out of virtio_blk_handle_read/write() dma-helpers: Fix too long qiov qtest: fix vhost-user-test compilation with old GLib tests: Fix unterminated string output visitor enum human string AioContext: do not rely on aio_poll(ctx, true) result to end a loop virtio-blk: embed VirtQueueElement in VirtIOBlockReq virtio-blk: avoid g_slice_new0() for VirtIOBlockReq and VirtQueueElement dataplane: do not free VirtQueueElement in vring_push() virtio-blk: avoid dataplane VirtIOBlockReq early free block: Assert qiov length matches request length qed: Make qiov match request size until backing file EOF qcow2: Make qiov match request size until backing file EOF block: Make qiov match the request size until EOF AioContext: speed up aio_notify test-aio: fix GSource-based timer test block: drop aio functions that operate on the main AioContext block: prefer aio_poll to qemu_aio_wait ... Signed-off-by: N Peter Maydell <peter.maydell@linaro.org>

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block patches for 2.1.0-rc2 (v2) # gpg: Signature made Mon 14 Jul 2014 11:04:12 BST using RSA key ID C88F2FD6 # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" * remotes/kevin/tags/for-upstream: (22 commits) ide: Treat read/write beyond end as invalid virtio-blk: Treat read/write beyond end as invalid virtio-blk: Bypass error action and I/O accounting on invalid r/w virtio-blk: Factor common checks out of virtio_blk_handle_read/write() dma-helpers: Fix too long qiov qtest: fix vhost-user-test compilation with old GLib tests: Fix unterminated string output visitor enum human string AioContext: do not rely on aio_poll(ctx, true) result to end a loop virtio-blk: embed VirtQueueElement in VirtIOBlockReq virtio-blk: avoid g_slice_new0() for VirtIOBlockReq and VirtQueueElement dataplane: do not free VirtQueueElement in vring_push() virtio-blk: avoid dataplane VirtIOBlockReq early free block: Assert qiov length matches request length qed: Make qiov match request size until backing file EOF qcow2: Make qiov match request size until backing file EOF block: Make qiov match the request size until EOF AioContext: speed up aio_notify test-aio: fix GSource-based timer test block: drop aio functions that operate on the main AioContext block: prefer aio_poll to qemu_aio_wait ... Signed-off-by: N Peter Maydell <peter.maydell@linaro.org>
7a6d04e7 · Peter Maydell · c15a34ed · 58ac3211 · 7a6d04e7 · 7a6d04e7
30 changed file
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -125,7 +125,7 @@ static bool aio_dispatch(AioContext *ctx)
    bool progress = false;

    /*
-     * We have to walk very carefully in case qemu_aio_set_fd_handler is
+     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
@@ -175,27 +175,56 @@ static bool aio_dispatch(AioContext *ctx)
 bool aio_poll(AioContext *ctx, bool blocking)
 {
    AioHandler *node;
+    bool was_dispatching;
    int ret;
    bool progress;

+    was_dispatching = ctx->dispatching;
    progress = false;

+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This happens
+     * in two cases:
+     *
+     * 1) when aio_poll is called with blocking == false
+     *
+     * 2) when we are called after poll().  If we are called before
+     *    poll(), bottom halves will not be re-evaluated and we need
+     *    aio_notify() if blocking == true.
+     *
+     * The first aio_dispatch() only does something when AioContext is
+     * running as a GSource, and in that case aio_poll is used only
+     * with blocking == false, so this optimization is already quite
+     * effective.  However, the code is ugly and should be restructured
+     * to have a single aio_dispatch() call.  To do this, we need to
+     * reorganize aio_poll into a prepare/poll/dispatch model like
+     * glib's.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for qemu_aio_wait loops).
+     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        blocking = false;
        progress = true;
    }

+    /* Re-evaluate condition (1) above.  */
+    aio_set_dispatching(ctx, !blocking);
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    if (progress && !blocking) {
-        return true;
+        goto out;
    }

    ctx->walking_handlers++;
@@ -234,9 +263,12 @@ bool aio_poll(AioContext *ctx, bool blocking)
    }

    /* Run dispatch even if there were no readable fds to run timers */
+    aio_set_dispatching(ctx, true);
    if (aio_dispatch(ctx)) {
        progress = true;
    }

+out:
+    aio_set_dispatching(ctx, was_dispatching);
    return progress;
 }
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -102,7 +102,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /*
     * If there are callbacks left that have been queued, we need to call then.
     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for qemu_aio_wait loops).
+     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        blocking = false;
@@ -115,7 +115,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /*
     * Then dispatch any pending callbacks from the GSource.
     *
-     * We have to walk very carefully in case qemu_aio_set_fd_handler is
+     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
@@ -177,7 +177,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
        blocking = false;

        /* we have to walk very carefully in case
-         * qemu_aio_set_fd_handler is called while we're walking */
+         * aio_set_fd_handler is called while we're walking */
        node = QLIST_FIRST(&ctx->aio_handlers);
        while (node) {
            AioHandler *tmp;

--- a/async.c
+++ b/async.c
@@ -26,6 +26,7 @@
 #include "block/aio.h"
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
+#include "qemu/atomic.h"

 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -247,9 +248,25 @@ ThreadPool *aio_get_thread_pool(AioContext *ctx)
    return ctx->thread_pool;
 }

+void aio_set_dispatching(AioContext *ctx, bool dispatching)
+{
+    ctx->dispatching = dispatching;
+    if (!dispatching) {
+        /* Write ctx->dispatching before reading e.g. bh->scheduled.
+         * Optimization: this is only needed when we're entering the "unsafe"
+         * phase where other threads must call event_notifier_set.
+         */
+        smp_mb();
+    }
+}
+
 void aio_notify(AioContext *ctx)
 {
-    event_notifier_set(&ctx->notifier);
+    /* Write e.g. bh->scheduled before reading ctx->dispatching.  */
+    smp_mb();
+    if (!ctx->dispatching) {
+        event_notifier_set(&ctx->notifier);
+    }
 }

 static void aio_timerlist_notify(void *opaque)

--- a/block.c
+++ b/block.c
@@ -471,7 +471,7 @@ int bdrv_create(BlockDriver *drv, const char* filename,
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
-            qemu_aio_wait();
+            aio_poll(qemu_get_aio_context(), true);
        }
    }

@@ -3010,6 +3010,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
@@ -3054,8 +3055,20 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
-            ret = drv->bdrv_co_readv(bs, sector_num,
-                                     MIN(nb_sectors, max_nb_sectors), qiov);
+            QEMUIOVector local_qiov;
+            size_t local_sectors;
+
+            max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
+            local_sectors = MIN(max_nb_sectors, nb_sectors);
+
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, 0,
+                              local_sectors * BDRV_SECTOR_SIZE);
+
+            ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
+                                     &local_qiov);
+
+            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }
@@ -3267,6 +3280,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
@@ -4040,7 +4054,7 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
    if (ret < 0) {
        return ret;
    }
-    return (ret & BDRV_BLOCK_ALLOCATED);
+    return !!(ret & BDRV_BLOCK_ALLOCATED);
 }

 /*

--- a/block/backup.c
+++ b/block/backup.c
@@ -307,7 +307,7 @@ static void coroutine_fn backup_run(void *opaque)
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

-                    if (alloced == 1) {
+                    if (alloced == 1 || n == 0) {
                        break;
                    }
                }

--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1020,11 +1020,20 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
                    sector_num, cur_nr_sectors);
                if (n1 > 0) {
+                    QEMUIOVector local_qiov;
+
+                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
+                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0,
+                                      n1 * BDRV_SECTOR_SIZE);
+
                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                    qemu_co_mutex_unlock(&s->lock);
                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
-                                        n1, &hd_qiov);
+                                        n1, &local_qiov);
                    qemu_co_mutex_lock(&s->lock);
+
+                    qemu_iovec_destroy(&local_qiov);
+
                    if (ret < 0) {
                        goto fail;
                    }

--- a/block/qed.c
+++ b/block/qed.c
@@ -761,17 +761,19 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 /**
 * Read from the backing file or zero-fill if no backing file
 *
- * @s:          QED state
- * @pos:        Byte position in device
- * @qiov:       Destination I/O vector
- * @cb:         Completion function
- * @opaque:     User data for completion function
+ * @s:              QED state
+ * @pos:            Byte position in device
+ * @qiov:           Destination I/O vector
+ * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here
+ * @cb:             Completion function
+ * @opaque:         User data for completion function
 *
 * This function reads qiov->size bytes starting at pos from the backing file.
 * If there is no backing file then zeroes are read.
 */
 static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                  QEMUIOVector *qiov,
+                                  QEMUIOVector **backing_qiov,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
    uint64_t backing_length = 0;
@@ -804,15 +806,21 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
    /* If the read straddles the end of the backing file, shorten it */
    size = MIN((uint64_t)backing_length - pos, qiov->size);

+    assert(*backing_qiov == NULL);
+    *backing_qiov = g_new(QEMUIOVector, 1);
+    qemu_iovec_init(*backing_qiov, qiov->niov);
+    qemu_iovec_concat(*backing_qiov, qiov, 0, size);
+
    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
    bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
-                   qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
 }

 typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    QEMUIOVector qiov;
+    QEMUIOVector *backing_qiov;
    struct iovec iov;
    uint64_t offset;
 } CopyFromBackingFileCB;
@@ -829,6 +837,12 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret)
    CopyFromBackingFileCB *copy_cb = opaque;
    BDRVQEDState *s = copy_cb->s;

+    if (copy_cb->backing_qiov) {
+        qemu_iovec_destroy(copy_cb->backing_qiov);
+        g_free(copy_cb->backing_qiov);
+        copy_cb->backing_qiov = NULL;
+    }
+
    if (ret) {
        qed_copy_from_backing_file_cb(copy_cb, ret);
        return;
@@ -866,11 +880,12 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
    copy_cb->s = s;
    copy_cb->offset = offset;
+    copy_cb->backing_qiov = NULL;
    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
    copy_cb->iov.iov_len = len;
    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);

-    qed_read_backing_file(s, pos, &copy_cb->qiov,
+    qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
                          qed_copy_from_backing_file_write, copy_cb);
 }

@@ -1313,7 +1328,7 @@ static void qed_aio_read_data(void *opaque, int ret,
        return;
    } else if (ret != QED_CLUSTER_FOUND) {
        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              qed_aio_next_io, acb);
+                              &acb->backing_qiov, qed_aio_next_io, acb);
        return;
    }

@@ -1339,6 +1354,12 @@ static void qed_aio_next_io(void *opaque, int ret)

    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);

+    if (acb->backing_qiov) {
+        qemu_iovec_destroy(acb->backing_qiov);
+        g_free(acb->backing_qiov);
+        acb->backing_qiov = NULL;
+    }
+
    /* Handle I/O error */
    if (ret) {
        qed_aio_complete(acb, ret);
@@ -1378,6 +1399,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
    acb->qiov_offset = 0;
    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
+    acb->backing_qiov = NULL;
    acb->request.l2_table = NULL;
    qemu_iovec_init(&acb->cur_qiov, qiov->niov);


--- a/block/qed.h
+++ b/block/qed.h
@@ -142,6 +142,7 @@ typedef struct QEDAIOCB {

    /* Current cluster scatter-gather list */
    QEMUIOVector cur_qiov;
+    QEMUIOVector *backing_qiov;
    uint64_t cur_pos;               /* position on block device, in bytes */
    uint64_t cur_cluster;           /* cluster offset in image file */
    unsigned int cur_nclusters;     /* number of clusters being accessed */

--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -790,6 +790,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
            p += aiocb->aio_iov[i].iov_len;
        }
+        assert(p - buf == aiocb->aio_nbytes);
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
@@ -804,9 +805,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
                copy = aiocb->aio_iov[i].iov_len;
            }
            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+            assert(count >= copy);
            p     += copy;
            count -= copy;
        }
+        assert(count == 0);
    }
    qemu_vfree(buf);

@@ -993,12 +996,14 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
    acb->aio_type = type;
    acb->aio_fildes = fd;

+    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+
    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
+        assert(qiov->size == acb->aio_nbytes);
    }
-    acb->aio_nbytes = nb_sectors * 512;
-    acb->aio_offset = sector_num * 512;

    trace_paio_submit_co(sector_num, nb_sectors, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
@@ -1016,12 +1021,14 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
    acb->aio_type = type;
    acb->aio_fildes = fd;

+    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+
    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
+        assert(qiov->size == acb->aio_nbytes);
    }
-    acb->aio_nbytes = nb_sectors * 512;
-    acb->aio_offset = sector_num * 512;

    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));

--- a/blockjob.c
+++ b/blockjob.c
@@ -187,7 +187,7 @@ int block_job_cancel_sync(BlockJob *job)
    job->opaque = &data;
    block_job_cancel(job);
    while (data.ret == -EINPROGRESS) {
-        qemu_aio_wait();
+        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
 }

--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -170,6 +170,10 @@ static void dma_bdrv_cb(void *opaque, int ret)
        return;
    }

+    if (dbs->iov.size & ~BDRV_SECTOR_MASK) {
+        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
+    }
+
    dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_bdrv_cb, dbs);
    assert(dbs->acb);

--- a/docs/aio_notify.promela
+++ b/docs/aio_notify.promela
+/*
+ * This model describes the interaction between aio_set_dispatching()
+ * and aio_notify().
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain.  If you really want a license,
+ * the WTFPL will do.
+ *
+ * To simulate it:
+ *     spin -p docs/aio_notify.promela
+ *
+ * To verify it:
+ *     spin -a docs/aio_notify.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a
+ */
+
+#define MAX   4
+#define LAST  (1 << (MAX - 1))
+#define FINAL ((LAST << 1) - 1)
+
+bool dispatching;
+bool event;
+
+int req, done;
+
+active proctype waiter()
+{
+     int fetch, blocking;
+
+     do
+        :: done != FINAL -> {
+            // Computing "blocking" is separate from execution of the
+            // "bottom half"
+            blocking = (req == 0);
+
+            // This is our "bottom half"
+            atomic { fetch = req; req = 0; }
+            done = done | fetch;
+
+            // Wait for a nudge from the other side
+            do
+                :: event == 1 -> { event = 0; break; }
+                :: !blocking  -> break;
+            od;
+
+            dispatching = 1;
+
+            // If you are simulating this model, you may want to add
+            // something like this here:
+            //
+            //      int foo; foo++; foo++; foo++;
+            //
+            // This only wastes some time and makes it more likely
+            // that the notifier process hits the "fast path".
+
+            dispatching = 0;
+        }
+        :: else -> break;
+    od
+}
+
+active proctype notifier()
+{
+    int next = 1;
+    int sets = 0;
+
+    do
+        :: next <= LAST -> {
+            // generate a request
+            req = req | next;
+            next = next << 1;
+
+            // aio_notify
+            if
+                :: dispatching == 0 -> sets++; event = 1;
+                :: else             -> skip;
+            fi;
+
+            // Test both synchronous and asynchronous delivery
+            if
+                :: 1 -> do
+                            :: req == 0 -> break;
+                        od;
+                :: 1 -> skip;
+            fi;
+        }
+        :: else -> break;
+    od;
+    printf("Skipped %d event_notifier_set\n", MAX - sets);
+}
+
+#define p (done == FINAL)
+
+never  {
+    do
+        :: 1                      // after an arbitrarily long prefix
+        :: p -> break             // p becomes true
+    od;
+    do
+        :: !p -> accept: break    // it then must remains true forever after
+    od
+}
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -65,43 +65,41 @@ static void complete_request_vring(VirtIOBlockReq *req, unsigned char status)
 {
    stb_p(&req->in->status, status);

-    vring_push(&req->dev->dataplane->vring, req->elem,
+    vring_push(&req->dev->dataplane->vring, &req->elem,
               req->qiov.size + sizeof(*req->in));
    notify_guest(req->dev->dataplane);
-    g_slice_free(VirtIOBlockReq, req);
 }

 static void handle_notify(EventNotifier *e)
 {
    VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
                                           host_notifier);
-
-    VirtQueueElement *elem;
-    VirtIOBlockReq *req;
-    int ret;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);

    event_notifier_test_and_clear(&s->host_notifier);
    bdrv_io_plug(s->blk->conf.bs);
    for (;;) {
+        MultiReqBuffer mrb = {
+            .num_writes = 0,
+        };
+        int ret;
+
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
-            ret = vring_pop(s->vdev, &s->vring, &elem);
+            VirtIOBlockReq *req = virtio_blk_alloc_request(vblk);
+
+            ret = vring_pop(s->vdev, &s->vring, &req->elem);
            if (ret < 0) {
-                assert(elem == NULL);
+                virtio_blk_free_request(req);
                break; /* no more requests */
            }

-            trace_virtio_blk_data_plane_process_request(s, elem->out_num,
-                                                        elem->in_num, elem->index);
+            trace_virtio_blk_data_plane_process_request(s, req->elem.out_num,
+                                                        req->elem.in_num,
+                                                        req->elem.index);

-            req = g_slice_new(VirtIOBlockReq);
-            req->dev = VIRTIO_BLK(s->vdev);
-            req->elem = elem;
            virtio_blk_handle_request(req, &mrb);
        }


--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -29,18 +29,18 @@
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"

-static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
+VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
 {
-    VirtIOBlockReq *req = g_slice_new0(VirtIOBlockReq);
+    VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
    req->dev = s;
-    req->elem = g_slice_new0(VirtQueueElement);
+    req->qiov.size = 0;
+    req->next = NULL;
    return req;
 }

-static void virtio_blk_free_request(VirtIOBlockReq *req)
+void virtio_blk_free_request(VirtIOBlockReq *req)
 {
    if (req) {
-        g_slice_free(VirtQueueElement, req->elem);
        g_slice_free(VirtIOBlockReq, req);
    }
 }
@@ -54,7 +54,7 @@ static void virtio_blk_complete_request(VirtIOBlockReq *req,
    trace_virtio_blk_req_complete(req, status);

    stb_p(&req->in->status, status);
-    virtqueue_push(s->vq, req->elem, req->qiov.size + sizeof(*req->in));
+    virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
    virtio_notify(vdev, s->vq);
 }

@@ -119,7 +119,7 @@ static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
 {
    VirtIOBlockReq *req = virtio_blk_alloc_request(s);

-    if (!virtqueue_pop(s->vq, req->elem)) {
+    if (!virtqueue_pop(s->vq, &req->elem)) {
        virtio_blk_free_request(req);
        return NULL;
    }
@@ -252,7 +252,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
 {
    int status;

-    status = virtio_blk_handle_scsi_req(req->dev, req->elem);
+    status = virtio_blk_handle_scsi_req(req->dev, &req->elem);
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
 }
@@ -288,6 +288,25 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
    bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
 }

+static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
+                                     uint64_t sector, size_t size)
+{
+    uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
+    uint64_t total_sectors;
+
+    if (sector & dev->sector_mask) {
+        return false;
+    }
+    if (size % dev->conf->logical_block_size) {
+        return false;
+    }
+    bdrv_get_geometry(dev->bs, &total_sectors);
+    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
+        return false;
+    }
+    return true;
+}
+
 static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
    BlockRequest *blkreq;
@@ -295,19 +314,16 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)

    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);

-    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
-
    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);

-    if (sector & req->dev->sector_mask) {
-        virtio_blk_rw_complete(req, -EIO);
-        return;
-    }
-    if (req->qiov.size % req->dev->conf->logical_block_size) {
-        virtio_blk_rw_complete(req, -EIO);
+    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
+        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        virtio_blk_free_request(req);
        return;
    }

+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
+
    if (mrb->num_writes == 32) {
        virtio_submit_multiwrite(req->dev->bs, mrb);
    }
@@ -329,18 +345,15 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req)

    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);

-    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
-
    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);

-    if (sector & req->dev->sector_mask) {
-        virtio_blk_rw_complete(req, -EIO);
-        return;
-    }
-    if (req->qiov.size % req->dev->conf->logical_block_size) {
-        virtio_blk_rw_complete(req, -EIO);
+    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
+        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        virtio_blk_free_request(req);
        return;
    }
+
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
    bdrv_aio_readv(req->dev->bs, sector, &req->qiov,
                   req->qiov.size / BDRV_SECTOR_SIZE,
                   virtio_blk_rw_complete, req);
@@ -349,12 +362,12 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req)
 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
    uint32_t type;
-    struct iovec *in_iov = req->elem->in_sg;
-    struct iovec *iov = req->elem->out_sg;
-    unsigned in_num = req->elem->in_num;
-    unsigned out_num = req->elem->out_num;
+    struct iovec *in_iov = req->elem.in_sg;
+    struct iovec *iov = req->elem.out_sg;
+    unsigned in_num = req->elem.in_num;
+    unsigned out_num = req->elem.out_num;

-    if (req->elem->out_num < 1 || req->elem->in_num < 1) {
+    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        error_report("virtio-blk missing headers");
        exit(1);
    }
@@ -391,19 +404,19 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         * NB: per existing s/n string convention the string is
         * terminated by '\0' only when shorter than buffer.
         */
-        strncpy(req->elem->in_sg[0].iov_base,
+        strncpy(req->elem.in_sg[0].iov_base,
                s->blk.serial ? s->blk.serial : "",
-                MIN(req->elem->in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
+                MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        virtio_blk_free_request(req);
    } else if (type & VIRTIO_BLK_T_OUT) {
-        qemu_iovec_init_external(&req->qiov, &req->elem->out_sg[1],
-                                 req->elem->out_num - 1);
+        qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
+                                 req->elem.out_num - 1);
        virtio_blk_handle_write(req, mrb);
    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
-        qemu_iovec_init_external(&req->qiov, &req->elem->in_sg[0],
-                                 req->elem->in_num - 1);
+        qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
+                                 req->elem.in_num - 1);
        virtio_blk_handle_read(req);
    } else {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
@@ -627,7 +640,7 @@ static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)

    while (req) {
        qemu_put_sbyte(f, 1);
-        qemu_put_buffer(f, (unsigned char *)req->elem,
+        qemu_put_buffer(f, (unsigned char *)&req->elem,
                        sizeof(VirtQueueElement));
        req = req->next;
    }
@@ -652,15 +665,15 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,

    while (qemu_get_sbyte(f)) {
        VirtIOBlockReq *req = virtio_blk_alloc_request(s);
-        qemu_get_buffer(f, (unsigned char *)req->elem,
+        qemu_get_buffer(f, (unsigned char *)&req->elem,
                        sizeof(VirtQueueElement));
        req->next = s->rq;
        s->rq = req;

-        virtqueue_map_sg(req->elem->in_sg, req->elem->in_addr,
-            req->elem->in_num, 1);
-        virtqueue_map_sg(req->elem->out_sg, req->elem->out_addr,
-            req->elem->out_num, 0);
+        virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr,
+            req->elem.in_num, 1);
+        virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr,
+            req->elem.out_num, 0);
    }

    return 0;

--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -499,6 +499,18 @@ static void ide_rw_error(IDEState *s) {
    ide_set_irq(s->bus);
 }

+static bool ide_sect_range_ok(IDEState *s,
+                              uint64_t sector, uint64_t nb_sectors)
+{
+    uint64_t total_sectors;
+
+    bdrv_get_geometry(s->bs, &total_sectors);
+    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
+        return false;
+    }
+    return true;
+}
+
 static void ide_sector_read_cb(void *opaque, int ret)
 {
    IDEState *s = opaque;
@@ -554,6 +566,11 @@ void ide_sector_read(IDEState *s)
    printf("sector=%" PRId64 "\n", sector_num);
 #endif

+    if (!ide_sect_range_ok(s, sector_num, n)) {
+        ide_rw_error(s);
+        return;
+    }
+
    s->iov.iov_base = s->io_buffer;
    s->iov.iov_len  = n * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&s->qiov, &s->iov, 1);
@@ -671,6 +688,12 @@ void ide_dma_cb(void *opaque, int ret)
           sector_num, n, s->dma_cmd);
 #endif

+    if (!ide_sect_range_ok(s, sector_num, n)) {
+        dma_buf_commit(s);
+        ide_dma_error(s);
+        return;
+    }
+
    switch (s->dma_cmd) {
    case IDE_DMA_READ:
        s->bus->dma->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num,
@@ -790,6 +813,11 @@ void ide_sector_write(IDEState *s)
        n = s->req_nb_sectors;
    }

+    if (!ide_sect_range_ok(s, sector_num, n)) {
+        ide_rw_error(s);
+        return;
+    }
+
    s->iov.iov_base = s->io_buffer;
    s->iov.iov_len  = n * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&s->qiov, &s->iov, 1);

--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -272,7 +272,7 @@ static int get_indirect(Vring *vring, VirtQueueElement *elem,
    return 0;
 }

-void vring_free_element(VirtQueueElement *elem)
+static void vring_unmap_element(VirtQueueElement *elem)
 {
    int i;

@@ -287,8 +287,6 @@ void vring_free_element(VirtQueueElement *elem)
    for (i = 0; i < elem->in_num; i++) {
        vring_unmap(elem->in_sg[i].iov_base, true);
    }
-
-    g_slice_free(VirtQueueElement, elem);
 }

 /* This looks in the virtqueue and for the first available buffer, and converts
@@ -303,14 +301,16 @@ void vring_free_element(VirtQueueElement *elem)
 * Stolen from linux/drivers/vhost/vhost.c.
 */
 int vring_pop(VirtIODevice *vdev, Vring *vring,
-              VirtQueueElement **p_elem)
+              VirtQueueElement *elem)
 {
    struct vring_desc desc;
    unsigned int i, head, found = 0, num = vring->vr.num;
    uint16_t avail_idx, last_avail_idx;
-    VirtQueueElement *elem = NULL;
    int ret;

+    /* Initialize elem so it can be safely unmapped */
+    elem->in_num = elem->out_num = 0;
+
    /* If there was a fatal error then refuse operation */
    if (vring->broken) {
        ret = -EFAULT;
@@ -342,10 +342,8 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     * the index we've seen. */
    head = vring->vr.avail->ring[last_avail_idx % num];

-    elem = g_slice_new(VirtQueueElement);
    elem->index = head;
-    elem->in_num = elem->out_num = 0;
-    
+
    /* If their number is silly, that's an error. */
    if (unlikely(head >= num)) {
        error_report("Guest says index %u > %u is available", head, num);
@@ -393,7 +391,6 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,

    /* On success, increment avail index. */
    vring->last_avail_idx++;
-    *p_elem = elem;
    return head;

 out:
@@ -401,10 +398,7 @@ out:
    if (ret == -EFAULT) {
        vring->broken = true;
    }
-    if (elem) {
-        vring_free_element(elem);
-    }
-    *p_elem = NULL;
+    vring_unmap_element(elem);
    return ret;
 }

@@ -418,7 +412,7 @@ void vring_push(Vring *vring, VirtQueueElement *elem, int len)
    unsigned int head = elem->index;
    uint16_t new;

-    vring_free_element(elem);
+    vring_unmap_element(elem);

    /* Don't touch vring if a fatal error occurred */
    if (vring->broken) {

--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -60,8 +60,14 @@ struct AioContext {
     */
    int walking_handlers;

+    /* Used to avoid unnecessary event_notifier_set calls in aio_notify.
+     * Writes protected by lock or BQL, reads are lockless.
+     */
+    bool dispatching;
+
    /* lock to protect between bh's adders and deleter */
    QemuMutex bh_lock;
+
    /* Anchor of the list of Bottom Halves belonging to the context */
    struct QEMUBH *first_bh;

@@ -83,6 +89,9 @@ struct AioContext {
    QEMUTimerListGroup tlg;
 };

+/* Used internally to synchronize aio_poll against qemu_bh_schedule.  */
+void aio_set_dispatching(AioContext *ctx, bool dispatching);
+
 /**
 * aio_context_new: Allocate a new AioContext.
 *
@@ -205,9 +214,9 @@ bool aio_pending(AioContext *ctx);
 /* Progress in completing AIO work to occur.  This can issue new pending
 * aio as a result of executing I/O completion or bh callbacks.
 *
- * If there is no pending AIO operation or completion (bottom half),
- * return false.  If there are pending AIO operations of bottom halves,
- * return true.
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers.  If @blocking == true, this should always be true except
+ * if someone called aio_notify.
 *
 * If there are no pending bottom halves, but there are pending AIO
 * operations, it may not be possible to make any progress without
@@ -220,7 +229,7 @@ bool aio_poll(AioContext *ctx, bool blocking);
 #ifdef CONFIG_POSIX
 /* Register a file descriptor and associated callbacks.  Behaves very similarly
 * to qemu_set_fd_handler2.  Unlike qemu_set_fd_handler2, these callbacks will
- * be invoked when using qemu_aio_wait().
+ * be invoked when using aio_poll().
 *
 * Code that invokes AIO completion functions should rely on this function
 * instead of qemu_set_fd_handler[2].
@@ -234,7 +243,7 @@ void aio_set_fd_handler(AioContext *ctx,

 /* Register an event notifier and associated callbacks.  Behaves very similarly
 * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
- * will be invoked when using qemu_aio_wait().
+ * will be invoked when using aio_poll().
 *
 * Code that invokes AIO completion functions should rely on this function
 * instead of event_notifier_set_handler.
@@ -251,19 +260,6 @@ GSource *aio_get_g_source(AioContext *ctx);
 /* Return the ThreadPool bound to this AioContext */
 struct ThreadPool *aio_get_thread_pool(AioContext *ctx);

-/* Functions to operate on the main QEMU AioContext.  */
-
-bool qemu_aio_wait(void);
-void qemu_aio_set_event_notifier(EventNotifier *notifier,
-                                 EventNotifierHandler *io_read);
-
-#ifdef CONFIG_POSIX
-void qemu_aio_set_fd_handler(int fd,
-                             IOHandler *io_read,
-                             IOHandler *io_write,
-                             void *opaque);
-#endif
-
 /**
 * aio_timer_new:
 * @ctx: the aio context

--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -74,7 +74,7 @@ struct BlockJob {
     * Set to true if the job should cancel itself.  The flag must
     * always be tested just before toggling the busy flag from false
     * to true.  After a job has been cancelled, it should only yield
-     * if #qemu_aio_wait will ("sooner or later") reenter the coroutine.
+     * if #aio_poll will ("sooner or later") reenter the coroutine.
     */
    bool cancelled;

@@ -87,7 +87,7 @@ struct BlockJob {
    /**
     * Set to false by the job while it is in a quiescent state, where
     * no I/O is pending and the job has yielded on any condition
-     * that is not detected by #qemu_aio_wait, such as a timer.
+     * that is not detected by #aio_poll, such as a timer.
     */
    bool busy;


--- a/include/block/coroutine.h
+++ b/include/block/coroutine.h
@@ -212,7 +212,7 @@ void coroutine_fn co_sleep_ns(QEMUClockType type, int64_t ns);
 * Yield the coroutine for a given duration
 *
 * Behaves similarly to co_sleep_ns(), but the sleeping coroutine will be
- * resumed when using qemu_aio_wait().
+ * resumed when using aio_poll().
 */
 void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
                                  int64_t ns);

--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -53,8 +53,7 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n);
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
-int vring_pop(VirtIODevice *vdev, Vring *vring, VirtQueueElement **elem);
+int vring_pop(VirtIODevice *vdev, Vring *vring, VirtQueueElement *elem);
 void vring_push(Vring *vring, VirtQueueElement *elem, int len);
-void vring_free_element(VirtQueueElement *elem);

 #endif /* VRING_H */
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -144,7 +144,7 @@ typedef struct MultiReqBuffer {

 typedef struct VirtIOBlockReq {
    VirtIOBlock *dev;
-    VirtQueueElement *elem;
+    VirtQueueElement elem;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr out;
    QEMUIOVector qiov;
@@ -152,6 +152,10 @@ typedef struct VirtIOBlockReq {
    BlockAcctCookie acct;
 } VirtIOBlockReq;

+VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
+
+void virtio_blk_free_request(VirtIOBlockReq *req);
+
 int virtio_blk_handle_scsi_req(VirtIOBlock *blk,
                               VirtQueueElement *elem);


--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -329,6 +329,7 @@ size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
                         int fillc, size_t bytes);
 ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b);
 void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf);
+void qemu_iovec_discard_back(QEMUIOVector *qiov, size_t bytes);

 bool buffer_is_zero(const void *buf, size_t len);


--- a/iothread.c
+++ b/iothread.c
@@ -30,6 +30,7 @@ typedef ObjectClass IOThreadClass;
 static void *iothread_run(void *opaque)
 {
    IOThread *iothread = opaque;
+    bool blocking;

    qemu_mutex_lock(&iothread->init_done_lock);
    iothread->thread_id = qemu_get_thread_id();
@@ -38,8 +39,10 @@ static void *iothread_run(void *opaque)

    while (!iothread->stopping) {
        aio_context_acquire(iothread->ctx);
-        while (!iothread->stopping && aio_poll(iothread->ctx, true)) {
+        blocking = true;
+        while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
            /* Progress was made, keep going */
+            blocking = false;
        }
        aio_context_release(iothread->ctx);
    }

--- a/main-loop.c
+++ b/main-loop.c
@@ -498,24 +498,3 @@ QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
 {
    return aio_bh_new(qemu_aio_context, cb, opaque);
 }
-
-bool qemu_aio_wait(void)
-{
-    return aio_poll(qemu_aio_context, true);
-}
-
-#ifdef CONFIG_POSIX
-void qemu_aio_set_fd_handler(int fd,
-                             IOHandler *io_read,
-                             IOHandler *io_write,
-                             void *opaque)
-{
-    aio_set_fd_handler(qemu_aio_context, fd, io_read, io_write, opaque);
-}
-#endif
-
-void qemu_aio_set_event_notifier(EventNotifier *notifier,
-                                 EventNotifierHandler *io_read)
-{
-    aio_set_event_notifier(qemu_aio_context, notifier, io_read);
-}
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -483,7 +483,7 @@ static int do_co_write_zeroes(BlockDriverState *bs, int64_t offset, int count,
    co = qemu_coroutine_create(co_write_zeroes_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
-        qemu_aio_wait();
+        aio_poll(bdrv_get_aio_context(bs), true);
    }
    if (data.ret < 0) {
        return data.ret;
@@ -2027,7 +2027,7 @@ static const cmdinfo_t resume_cmd = {
 static int wait_break_f(BlockDriverState *bs, int argc, char **argv)
 {
    while (!bdrv_debug_is_suspended(bs, argv[1])) {
-        qemu_aio_wait();
+        aio_poll(bdrv_get_aio_context(bs), true);
    }

    return 0;

--- a/tests/qemu-iotests/028
+++ b/tests/qemu-iotests/028
@@ -33,7 +33,8 @@ status=1	# failure is the default!

 _cleanup()
 {
-	_cleanup_test_img
+    rm -f "${TEST_IMG}.copy"
+    _cleanup_test_img
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15

@@ -41,6 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 . ./common.pattern
+. ./common.qemu

 # Any format supporting backing files except vmdk and qcow which do not support
 # smaller backing files.
@@ -99,6 +101,29 @@ _check_test_img
 # Rebase it on top of its base image
 $QEMU_IMG rebase -b "$TEST_IMG.base" "$TEST_IMG"

+echo
+echo block-backup
+echo
+
+qemu_comm_method="monitor"
+_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk
+h=$QEMU_HANDLE
+QEMU_COMM_TIMEOUT=1
+
+_send_qemu_cmd $h "drive_backup disk ${TEST_IMG}.copy" "(qemu)"
+qemu_cmd_repeat=20 _send_qemu_cmd $h "info block-jobs" "No active jobs"
+_send_qemu_cmd $h 'quit' ""
+
+# Base image sectors
+TEST_IMG="${TEST_IMG}.copy" io readv $(( offset )) 512 1024 32
+
+# Image sectors
+TEST_IMG="${TEST_IMG}.copy" io readv $(( offset + 512 )) 512 1024 64
+
+# Zero sectors beyond end of base image
+TEST_IMG="${TEST_IMG}.copy" io_zero readv $(( offset + 32 * 1024 )) 512 1024 32
+
+
 _check_test_img

 # success, all done

--- a/tests/qemu-iotests/028.out
+++ b/tests/qemu-iotests/028.out
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -24,14 +24,6 @@ typedef struct {
    bool auto_set;
 } EventNotifierTestData;

-/* Wait until there are no more BHs or AIO requests */
-static void wait_for_aio(void)
-{
-    while (aio_poll(ctx, true)) {
-        /* Do nothing */
-    }
-}
-
 /* Wait until event notifier becomes inactive */
 static void wait_until_inactive(EventNotifierTestData *data)
 {
@@ -204,7 +196,9 @@ static void test_bh_schedule10(void)
    g_assert(aio_poll(ctx, true));
    g_assert_cmpint(data.n, ==, 2);

-    wait_for_aio();
+    while (data.n < 10) {
+        aio_poll(ctx, true);
+    }
    g_assert_cmpint(data.n, ==, 10);

    g_assert(!aio_poll(ctx, false));
@@ -252,7 +246,9 @@ static void test_bh_delete_from_cb(void)
    qemu_bh_schedule(data1.bh);
    g_assert_cmpint(data1.n, ==, 0);

-    wait_for_aio();
+    while (data1.n < data1.max) {
+        aio_poll(ctx, true);
+    }
    g_assert_cmpint(data1.n, ==, data1.max);
    g_assert(data1.bh == NULL);

@@ -287,7 +283,12 @@ static void test_bh_delete_from_cb_many(void)
    g_assert_cmpint(data4.n, ==, 1);
    g_assert(data1.bh == NULL);

-    wait_for_aio();
+    while (data1.n < data1.max ||
+           data2.n < data2.max ||
+           data3.n < data3.max ||
+           data4.n < data4.max) {
+        aio_poll(ctx, true);
+    }
    g_assert_cmpint(data1.n, ==, data1.max);
    g_assert_cmpint(data2.n, ==, data2.max);
    g_assert_cmpint(data3.n, ==, data3.max);
@@ -306,7 +307,7 @@ static void test_bh_flush(void)
    qemu_bh_schedule(data.bh);
    g_assert_cmpint(data.n, ==, 0);

-    wait_for_aio();
+    g_assert(aio_poll(ctx, true));
    g_assert_cmpint(data.n, ==, 1);

    g_assert(!aio_poll(ctx, false));
@@ -806,17 +807,16 @@ static void test_source_timer_schedule(void)
    g_usleep(1 * G_USEC_PER_SEC);
    g_assert_cmpint(data.n, ==, 0);

-    g_assert(g_main_context_iteration(NULL, false));
+    g_assert(g_main_context_iteration(NULL, true));
    g_assert_cmpint(data.n, ==, 1);
+    expiry += data.ns;

-    /* The comment above was not kidding when it said this wakes up itself */
-    do {
-        g_assert(g_main_context_iteration(NULL, true));
-    } while (qemu_clock_get_ns(data.clock_type) <= expiry);
-    g_usleep(1 * G_USEC_PER_SEC);
-    g_main_context_iteration(NULL, false);
+    while (data.n < 2) {
+        g_main_context_iteration(NULL, true);
+    }

    g_assert_cmpint(data.n, ==, 2);
+    g_assert(qemu_clock_get_ns(data.clock_type) > expiry);

    aio_set_fd_handler(ctx, pipefd[0], NULL, NULL, NULL);
    close(pipefd[0]);

--- a/tests/test-thread-pool.c
+++ b/tests/test-thread-pool.c
@@ -83,7 +83,7 @@ static void co_test_cb(void *opaque)
    data->ret = 0;
    active--;

-    /* The test continues in test_submit_co, after qemu_aio_wait_all... */
+    /* The test continues in test_submit_co, after aio_poll... */
 }

 static void test_submit_co(void)
@@ -98,7 +98,7 @@ static void test_submit_co(void)
    g_assert_cmpint(active, ==, 1);
    g_assert_cmpint(data.ret, ==, -EINPROGRESS);

-    /* qemu_aio_wait_all will execute the rest of the coroutine.  */
+    /* aio_poll will execute the rest of the coroutine.  */

    while (data.ret == -EINPROGRESS) {
        aio_poll(ctx, true);

--- a/util/iov.c
+++ b/util/iov.c
@@ -550,3 +550,16 @@ size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt,

    return total;
 }
+
+void qemu_iovec_discard_back(QEMUIOVector *qiov, size_t bytes)
+{
+    size_t total;
+    unsigned int niov = qiov->niov;
+
+    assert(qiov->size >= bytes);
+    total = iov_discard_back(qiov->iov, &niov, bytes);
+    assert(total == bytes);
+
+    qiov->niov = niov;
+    qiov->size -= bytes;
+}