io.c 92.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

P
Peter Maydell 已提交
25
#include "qemu/osdep.h"
26
#include "trace.h"
27
#include "sysemu/block-backend.h"
28
#include "block/aio-wait.h"
29
#include "block/blockjob.h"
30
#include "block/blockjob_int.h"
31
#include "block/block_int.h"
32
#include "qemu/cutils.h"
33
#include "qapi/error.h"
34
#include "qemu/error-report.h"
35 36 37

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

E
Eric Blake 已提交
38 39 40
/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

41 42
static AioWait drain_all_aio_wait;

E
Eric Blake 已提交
43
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
44
    int64_t offset, int bytes, BdrvRequestFlags flags);
45

46 47
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
48
{
49
    BdrvChild *c, *next;
50

51
    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
52
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
53 54
            continue;
        }
55 56 57
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
58 59
    }
}
60

61 62
void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
63
{
64
    BdrvChild *c, *next;
65

66
    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
67
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
68 69
            continue;
        }
70 71 72
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
73
    }
74 75
}

76 77
static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
78 79 80 81 82
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
83
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
84 85 86 87 88 89 90 91 92 93
            continue;
        }
        if (c->role->drained_poll) {
            busy |= c->role->drained_poll(c);
        }
    }

    return busy;
}

94 95 96 97 98 99 100 101 102 103 104
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

105 106 107 108 109 110 111 112 113 114 115
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

116
    /* Default alignment based on whether driver has byte interface */
117 118
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;
119

120 121
    /* Take some limits from the children as a default */
    if (bs->file) {
K
Kevin Wolf 已提交
122
        bdrv_refresh_limits(bs->file->bs, &local_err);
123 124 125 126
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
127
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
128
    } else {
129
        bs->bl.min_mem_alignment = 512;
130
        bs->bl.opt_mem_alignment = getpagesize();
131 132 133

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
134 135
    }

136 137
    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
138 139 140 141
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
142
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
158
    atomic_inc(&bs->copy_on_read);
159 160 161 162
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
163 164
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
165 166
}

167 168 169 170
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
171
    bool begin;
172
    bool recursive;
173
    bool poll;
174
    BdrvChild *parent;
175
    bool ignore_bds_parents;
176 177 178 179 180 181 182
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

183
    if (data->begin) {
184
        bs->drv->bdrv_co_drain_begin(bs);
185 186 187
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }
188 189 190

    /* Set data->done before reading bs->wakeup.  */
    atomic_mb_set(&data->done, true);
191 192 193 194 195
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
196 197
}

198
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
199
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
200
{
201
    BdrvCoDrainData *data;
202

203
    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
204
            (!begin && !bs->drv->bdrv_co_drain_end)) {
205 206 207
        return;
    }

208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
225 226
}

227
/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
228
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
229
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
230
{
231 232
    BdrvChild *child, *next;

233
    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
234 235 236
        return true;
    }

237 238 239 240 241
    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
242
        assert(!ignore_bds_parents);
243
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
244
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
245 246 247 248 249 250
                return true;
            }
        }
    }

    return false;
251 252
}

253
static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
254
                                      BdrvChild *ignore_parent)
255 256 257 258
{
    /* Execute pending BHs first and check everything else only after the BHs
     * have executed. */
    while (aio_poll(bs->aio_context, false));
259

260
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
261 262
}

263
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
264 265
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
266
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
267
                                BdrvChild *parent, bool ignore_bds_parents);
268

F
Fam Zheng 已提交
269 270 271 272
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
273
    BlockDriverState *bs = data->bs;
F
Fam Zheng 已提交
274

275 276 277
    if (bs) {
        bdrv_dec_in_flight(bs);
        if (data->begin) {
278 279
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
280
        } else {
281 282
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
283
        }
284
    } else {
285 286
        assert(data->begin);
        bdrv_drain_all_begin();
287 288
    }

F
Fam Zheng 已提交
289
    data->done = true;
290
    aio_co_wake(co);
F
Fam Zheng 已提交
291 292
}

293
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
294
                                                bool begin, bool recursive,
295 296 297
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
F
Fam Zheng 已提交
298 299 300 301
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
302
     * other coroutines run if they were queued by aio_co_enter(). */
F
Fam Zheng 已提交
303 304 305 306 307 308

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
309
        .begin = begin,
310
        .recursive = recursive,
311
        .parent = parent,
312
        .ignore_bds_parents = ignore_bds_parents,
313
        .poll = poll,
F
Fam Zheng 已提交
314
    };
315 316 317
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
P
Paolo Bonzini 已提交
318 319
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);
F
Fam Zheng 已提交
320 321 322 323 324 325 326

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

327
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
328
                                   BdrvChild *parent, bool ignore_bds_parents)
329
{
330
    assert(!qemu_in_coroutine());
331

K
Kevin Wolf 已提交
332
    /* Stop things in parent-to-child order */
333
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
334 335 336
        aio_disable_external(bdrv_get_aio_context(bs));
    }

337
    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
338
    bdrv_drain_invoke(bs, true);
339 340 341
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
342 343
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
344 345 346 347
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
348 349
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
350 351 352
        return;
    }

353
    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
K
Kevin Wolf 已提交
354

355
    if (recursive) {
356
        assert(!ignore_bds_parents);
357
        bs->recursive_quiesce_counter++;
358
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
359 360
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
361 362
        }
    }
363 364 365 366 367 368 369 370 371 372 373

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
374
        assert(!ignore_bds_parents);
375 376
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
377 378
}

379 380
void bdrv_drained_begin(BlockDriverState *bs)
{
381
    bdrv_do_drained_begin(bs, false, NULL, false, true);
382 383 384 385
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
386
    bdrv_do_drained_begin(bs, true, NULL, false, true);
387 388
}

389 390
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
391
{
392
    BdrvChild *child, *next;
393 394
    int old_quiesce_counter;

395
    if (qemu_in_coroutine()) {
396 397
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
398 399
        return;
    }
400
    assert(bs->quiesce_counter > 0);
401
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
402

K
Kevin Wolf 已提交
403
    /* Re-enable things in child-to-parent order */
404
    bdrv_drain_invoke(bs, false);
405
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
406 407 408
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }
409 410

    if (recursive) {
411
        assert(!ignore_bds_parents);
412
        bs->recursive_quiesce_counter--;
413
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
414
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
415 416
        }
    }
417 418
}

419 420
void bdrv_drained_end(BlockDriverState *bs)
{
421
    bdrv_do_drained_end(bs, false, NULL, false);
422 423 424 425
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
426
    bdrv_do_drained_end(bs, true, NULL, false);
427 428
}

429 430 431 432 433
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
434
        bdrv_do_drained_begin(child->bs, true, child, false, true);
435 436 437 438 439 440 441 442
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
443
        bdrv_do_drained_end(child->bs, true, child, false);
444 445 446
    }
}

447
/*
448 449
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
450 451 452 453
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
454
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
455
{
456 457 458
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
459
}
460

461 462
void bdrv_drain(BlockDriverState *bs)
{
463 464
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
465 466
}

467 468 469 470 471 472 473 474 475 476
static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499
unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* Execute pending BHs first (may modify the graph) and check everything
     * else only after the BHs have executed. */
    while (aio_poll(qemu_get_aio_context(), false));

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

500 501 502 503 504
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
505 506 507 508 509 510
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
511
 */
512
void bdrv_drain_all_begin(void)
513
{
514
    BlockDriverState *bs = NULL;
515

516
    if (qemu_in_coroutine()) {
517
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
518 519 520
        return;
    }

521 522
    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
523
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
524 525
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;
526

527 528 529
    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
530 531 532
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
533
        bdrv_do_drained_begin(bs, false, NULL, true, false);
534 535 536
        aio_context_release(aio_context);
    }

537 538 539 540
    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
541
        bdrv_drain_assert_idle(bs);
542
    }
543 544 545 546
}

void bdrv_drain_all_end(void)
{
547
    BlockDriverState *bs = NULL;
548

549
    while ((bs = bdrv_next_all_states(bs))) {
550 551 552
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
553
        bdrv_do_drained_end(bs, false, NULL, true);
554 555
        aio_context_release(aio_context);
    }
556 557 558

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
559 560
}

561 562 563 564 565 566
void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

567 568 569 570 571 572 573 574
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
575
        atomic_dec(&req->bs->serialising_in_flight);
576 577
    }

578
    qemu_co_mutex_lock(&req->bs->reqs_lock);
579 580
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
581
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
582 583 584 585 586 587 588 589
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
590 591
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
592 593 594 595 596
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
597
        .type           = type,
598 599 600 601 602 603 604 605
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

606
    qemu_co_mutex_lock(&bs->reqs_lock);
607
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
608
    qemu_co_mutex_unlock(&bs->reqs_lock);
609 610 611 612 613 614 615 616 617
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
618
        atomic_inc(&req->bs->serialising_in_flight);
619 620 621 622 623 624 625
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

626 627 628 629
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
630
                            int64_t offset, int64_t bytes,
631
                            int64_t *cluster_offset,
632
                            int64_t *cluster_bytes)
633 634 635 636 637 638 639 640 641 642 643 644 645
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

646 647 648 649 650 651 652
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
653
        return bs->bl.request_alignment;
654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

673 674 675 676 677
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

678 679
void bdrv_wakeup(BlockDriverState *bs)
{
680
    aio_wait_kick(bdrv_get_aio_wait(bs));
681
    aio_wait_kick(&drain_all_aio_wait);
682 683
}

684 685 686
void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
687
    bdrv_wakeup(bs);
688 689
}

690 691 692 693 694 695 696
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

697
    if (!atomic_read(&bs->serialising_in_flight)) {
698 699 700 701 702
        return false;
    }

    do {
        retry = false;
703
        qemu_co_mutex_lock(&bs->reqs_lock);
704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
722
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
723 724 725 726 727 728 729
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
730
        qemu_co_mutex_unlock(&bs->reqs_lock);
731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
755
    BdrvChild *child;
756 757 758 759 760 761 762 763 764 765 766 767
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
768
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
769 770
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
771
    } else {
772
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
773 774
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
775 776 777 778 779 780
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
781
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
782 783 784 785 786
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
787
        .child = child,
788 789 790 791 792 793 794 795 796 797 798
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
799
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
800
        bdrv_coroutine_enter(child->bs, co);
P
Paolo Bonzini 已提交
801
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
802 803 804 805 806 807 808
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
809
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
810 811 812 813 814 815 816 817 818 819 820 821 822
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
823
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
824 825 826 827
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
828
int bdrv_read(BdrvChild *child, int64_t sector_num,
829 830
              uint8_t *buf, int nb_sectors)
{
831
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
832 833 834 835 836 837 838 839
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
840
int bdrv_write(BdrvChild *child, int64_t sector_num,
841 842
               const uint8_t *buf, int nb_sectors)
{
843
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
844 845
}

846
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
847
                       int bytes, BdrvRequestFlags flags)
848
{
849 850 851
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
852
        .iov_len = bytes,
853 854 855
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
856
    return bdrv_prwv_co(child, offset, &qiov, true,
857
                        BDRV_REQ_ZERO_WRITE | flags);
858 859 860
}

/*
861
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
862 863
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
864
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
865
 * BDRV_REQ_FUA).
866 867 868
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
869
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
870
{
871 872
    int ret;
    int64_t target_size, bytes, offset = 0;
873
    BlockDriverState *bs = child->bs;
874

875 876 877
    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
878 879 880
    }

    for (;;) {
881 882
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
883 884
            return 0;
        }
885
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
886
        if (ret < 0) {
887 888
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
889 890 891
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
892
            offset += bytes;
893 894
            continue;
        }
895
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
896
        if (ret < 0) {
897 898
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
899 900
            return ret;
        }
901
        offset += bytes;
902 903 904
    }
}

905
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
K
Kevin Wolf 已提交
906 907 908
{
    int ret;

909
    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
K
Kevin Wolf 已提交
910 911 912 913 914 915 916
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

917
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
918 919 920 921 922 923 924 925 926 927 928 929
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
930
    return bdrv_preadv(child, offset, &qiov);
931 932
}

933
int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
934 935 936
{
    int ret;

937
    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
938 939 940 941 942 943 944
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

945
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
946 947 948 949 950 951 952 953 954 955 956 957
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
958
    return bdrv_pwritev(child, offset, &qiov);
959 960 961 962 963 964 965 966
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
967 968
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
969 970 971
{
    int ret;

972
    ret = bdrv_pwrite(child, offset, buf, count);
973 974 975 976
    if (ret < 0) {
        return ret;
    }

977
    ret = bdrv_flush(child->bs);
978 979
    if (ret < 0) {
        return ret;
980 981 982 983 984
    }

    return 0;
}

985 986 987 988 989 990 991 992 993 994
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
995
    aio_co_wake(co->coroutine);
996 997
}

998 999 1000 1001 1002
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
1003 1004 1005
    int64_t sector_num;
    unsigned int nb_sectors;

1006 1007
    assert(!(flags & ~BDRV_REQ_MASK));

M
Max Reitz 已提交
1008 1009 1010 1011
    if (!drv) {
        return -ENOMEDIUM;
    }

1012 1013 1014 1015
    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

1016
    if (drv->bdrv_aio_preadv) {
1017 1018 1019 1020 1021
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

1022 1023
        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
1024 1025 1026 1027 1028 1029 1030
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
1031 1032 1033 1034 1035 1036 1037 1038 1039 1040

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1041 1042
}

1043 1044 1045 1046 1047
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
1048 1049
    int64_t sector_num;
    unsigned int nb_sectors;
1050 1051
    int ret;

1052 1053
    assert(!(flags & ~BDRV_REQ_MASK));

M
Max Reitz 已提交
1054 1055 1056 1057
    if (!drv) {
        return -ENOMEDIUM;
    }

1058
    if (drv->bdrv_co_pwritev) {
1059 1060 1061
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
1062 1063 1064
        goto emulate_flags;
    }

1065
    if (drv->bdrv_aio_pwritev) {
1066 1067 1068 1069 1070
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

1071 1072 1073 1074
        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
1075
        if (acb == NULL) {
1076
            ret = -EIO;
1077 1078
        } else {
            qemu_coroutine_yield();
1079
            ret = co.ret;
1080
        }
1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

1091 1092 1093 1094
    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;
1095

1096
emulate_flags:
1097
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1098 1099 1100 1101 1102 1103
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

1104 1105 1106 1107 1108 1109
static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

M
Max Reitz 已提交
1110 1111 1112 1113
    if (!drv) {
        return -ENOMEDIUM;
    }

1114 1115 1116 1117 1118 1119 1120
    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

1121
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1122
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
1123
{
1124 1125
    BlockDriverState *bs = child->bs;

1126 1127 1128 1129 1130 1131 1132 1133 1134
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
E
Eric Blake 已提交
1135
    QEMUIOVector local_qiov;
1136
    int64_t cluster_offset;
1137
    int64_t cluster_bytes;
1138 1139
    size_t skip_bytes;
    int ret;
E
Eric Blake 已提交
1140 1141 1142
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;
1143

M
Max Reitz 已提交
1144 1145 1146 1147
    if (!drv) {
        return -ENOMEDIUM;
    }

1148 1149 1150
    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
E
Eric Blake 已提交
1151
     * long as it is implemented here rather than in a separate filter driver,
1152 1153 1154 1155
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1156

1157
    /* Cover entire cluster so no additional backing file I/O is required when
E
Eric Blake 已提交
1158 1159 1160
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
1161
     */
1162
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
E
Eric Blake 已提交
1163
    skip_bytes = offset - cluster_offset;
1164

1165 1166
    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);
1167

E
Eric Blake 已提交
1168 1169 1170
    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
1171 1172 1173 1174 1175
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

E
Eric Blake 已提交
1176 1177
    while (cluster_bytes) {
        int64_t pnum;
1178

E
Eric Blake 已提交
1179 1180 1181 1182 1183 1184 1185 1186 1187
        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }
1188

E
Eric Blake 已提交
1189
        assert(skip_bytes < pnum);
1190

E
Eric Blake 已提交
1191 1192 1193 1194 1195
        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);
1196

E
Eric Blake 已提交
1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208
            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
1209 1210
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
E
Eric Blake 已提交
1211 1212 1213 1214 1215
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1216 1217
                                          &local_qiov,
                                          BDRV_REQ_WRITE_UNCHANGED);
E
Eric Blake 已提交
1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;
1249 1250 1251 1252 1253 1254 1255 1256

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
1257 1258
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
1259
 */
1260
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1261 1262 1263
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
1264
    BlockDriverState *bs = child->bs;
K
Kevin Wolf 已提交
1265
    int64_t total_bytes, max_bytes;
1266 1267 1268
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;
1269

1270 1271 1272
    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
1273
    assert(!qiov || bytes == qiov->size);
1274
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1275 1276
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);
1277 1278 1279 1280 1281 1282

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

1294 1295 1296
    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }
1297 1298

    if (flags & BDRV_REQ_COPY_ON_READ) {
1299
        int64_t pnum;
1300

1301
        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1302 1303 1304 1305
        if (ret < 0) {
            goto out;
        }

1306
        if (!ret || pnum != bytes) {
1307
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
1308 1309 1310 1311
            goto out;
        }
    }

1312
    /* Forward the request to the BlockDriver, possibly fragmenting it */
K
Kevin Wolf 已提交
1313 1314 1315 1316 1317
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }
1318

K
Kevin Wolf 已提交
1319
    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1320
    if (bytes <= max_bytes && bytes <= max_transfer) {
K
Kevin Wolf 已提交
1321
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1322 1323
        goto out;
    }
1324

1325 1326
    while (bytes_remaining) {
        int num;
1327

1328 1329
        if (max_bytes) {
            QEMUIOVector local_qiov;
1330

1331 1332 1333 1334
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1335

1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348
            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
1349 1350 1351
    }

out:
1352
    return ret < 0 ? ret : 0;
1353 1354 1355 1356 1357
}

/*
 * Handle a read request in coroutine context
 */
1358
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1359 1360 1361
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
1362
    BlockDriverState *bs = child->bs;
1363 1364 1365
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

1366
    uint64_t align = bs->bl.request_alignment;
1367 1368 1369 1370 1371 1372
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

1373 1374
    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

1375 1376 1377 1378 1379 1380 1381 1382 1383
    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

1384 1385
    bdrv_inc_in_flight(bs);

1386
    /* Don't do copy-on-read if we read data before write operation */
1387
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

1416
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1417
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1418 1419 1420
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
1421
    bdrv_dec_in_flight(bs);
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

E
Eric Blake 已提交
1432
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1433
    int64_t offset, int bytes, BdrvRequestFlags flags)
1434 1435 1436 1437 1438
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
1439
    bool need_flush = false;
1440 1441
    int head = 0;
    int tail = 0;
1442

1443
    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1444 1445
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
E
Eric Blake 已提交
1446
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
E
Eric Blake 已提交
1447

M
Max Reitz 已提交
1448 1449 1450 1451
    if (!drv) {
        return -ENOMEDIUM;
    }

1452 1453
    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
1454
    tail = (offset + bytes) % alignment;
1455 1456
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);
1457

1458 1459
    while (bytes > 0 && !ret) {
        int num = bytes;
1460 1461

        /* Align request.  Block drivers can expect the "bulk" of the request
1462 1463
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
1464
         */
1465
        if (head) {
1466 1467 1468
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
1469
            num = MIN(MIN(bytes, max_transfer), alignment - head);
1470 1471
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
E
Eric Blake 已提交
1472
        } else if (tail && num > alignment) {
1473 1474
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
1475 1476 1477 1478 1479 1480 1481 1482 1483
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
E
Eric Blake 已提交
1484 1485 1486 1487 1488 1489 1490
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
1491 1492
        } else {
            assert(!bs->supported_zero_flags);
1493 1494 1495 1496
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
1497 1498 1499 1500 1501 1502 1503 1504 1505
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
1506
            num = MIN(num, max_transfer);
E
Eric Blake 已提交
1507
            iov.iov_len = num;
1508
            if (iov.iov_base == NULL) {
E
Eric Blake 已提交
1509
                iov.iov_base = qemu_try_blockalign(bs, num);
1510 1511 1512 1513
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
E
Eric Blake 已提交
1514
                memset(iov.iov_base, 0, num);
1515 1516 1517
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

E
Eric Blake 已提交
1518
            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1519 1520 1521 1522

            /* Keep bounce buffer around if it is big enough for all
             * all future requests.
             */
1523
            if (num < max_transfer) {
1524 1525 1526 1527 1528
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

E
Eric Blake 已提交
1529
        offset += num;
1530
        bytes -= num;
1531 1532 1533
    }

fail:
1534 1535 1536
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
1537 1538 1539 1540 1541
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
1542 1543
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
1544
 */
1545
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1546
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1547
    int64_t align, QEMUIOVector *qiov, int flags)
1548
{
1549
    BlockDriverState *bs = child->bs;
1550 1551 1552 1553
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

1554
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1555 1556
    uint64_t bytes_remaining = bytes;
    int max_transfer;
1557

M
Max Reitz 已提交
1558 1559 1560 1561
    if (!drv) {
        return -ENOMEDIUM;
    }

1562 1563 1564 1565
    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

1566 1567 1568
    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
1569
    assert(!qiov || bytes == qiov->size);
1570
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1571
    assert(!(flags & ~BDRV_REQ_MASK));
1572 1573
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);
1574 1575 1576 1577 1578

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1579 1580 1581 1582 1583
    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
        assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
    } else {
        assert(child->perm & BLK_PERM_WRITE);
    }
1584
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1585 1586 1587 1588

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
E
Eric Blake 已提交
1589
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1590 1591 1592 1593 1594 1595 1596 1597 1598 1599
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
K
Kevin Wolf 已提交
1600
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1601
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1602 1603
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1604
    } else if (bytes <= max_transfer) {
K
Kevin Wolf 已提交
1605
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1606
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
1632
    }
K
Kevin Wolf 已提交
1633
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1634

1635
    atomic_inc(&bs->write_gen);
1636
    bdrv_set_dirty(bs, offset, bytes);
1637

1638
    stat64_max(&bs->wr_highest_offset, offset + bytes);
1639 1640

    if (ret >= 0) {
1641
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
1642
        ret = 0;
1643 1644 1645 1646 1647
    }

    return ret;
}

1648
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1649 1650 1651 1652 1653
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
1654
    BlockDriverState *bs = child->bs;
1655 1656 1657
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
1658
    uint64_t align = bs->bl.request_alignment;
1659 1660 1661 1662
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
1663
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base   = buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
K
Kevin Wolf 已提交
1681
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1682
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1683 1684 1685 1686
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
K
Kevin Wolf 已提交
1687
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1688 1689

        memset(buf + head_padding_bytes, 0, zero_bytes);
1690
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1691
                                   align, &local_qiov,
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
1704
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
K
Kevin Wolf 已提交
1719
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1720
        ret = bdrv_aligned_preadv(child, req, offset, align,
1721 1722 1723 1724
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
K
Kevin Wolf 已提交
1725
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1726 1727

        memset(buf, 0, bytes);
1728
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1729 1730 1731 1732 1733 1734 1735 1736
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

1737 1738 1739
/*
 * Handle a write request in coroutine context
 */
1740
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1741 1742 1743
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
1744
    BlockDriverState *bs = child->bs;
1745
    BdrvTrackedRequest req;
1746
    uint64_t align = bs->bl.request_alignment;
1747 1748 1749 1750 1751 1752
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

1753 1754
    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

1755 1756 1757 1758
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
1759
        return -EPERM;
1760
    }
1761
    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1762 1763 1764 1765 1766 1767

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

1768
    bdrv_inc_in_flight(bs);
1769 1770 1771 1772 1773
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
1774
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1775

1776
    if (flags & BDRV_REQ_ZERO_WRITE) {
1777
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1778 1779 1780
        goto out;
    }

1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794
    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

K
Kevin Wolf 已提交
1795
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1796
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1797 1798 1799 1800
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
K
Kevin Wolf 已提交
1801
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1802 1803 1804 1805 1806 1807 1808 1809

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
1810 1811 1812 1813 1814 1815 1816 1817

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

K
Kevin Wolf 已提交
1837
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1838 1839
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
1840 1841 1842
        if (ret < 0) {
            goto fail;
        }
K
Kevin Wolf 已提交
1843
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

1857
    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1858 1859
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
1860 1861 1862 1863 1864 1865 1866 1867

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
1868 1869
out:
    tracked_request_end(&req);
1870
    bdrv_dec_in_flight(bs);
1871 1872 1873
    return ret;
}

1874
int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1875
                                       int bytes, BdrvRequestFlags flags)
1876
{
1877
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1878

1879
    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1880 1881 1882
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

1883
    return bdrv_co_pwritev(child, offset, bytes, NULL,
1884
                           BDRV_REQ_ZERO_WRITE | flags);
1885 1886
}

J
John Snow 已提交
1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911
/*
 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


1912
typedef struct BdrvCoBlockStatusData {
1913 1914
    BlockDriverState *bs;
    BlockDriverState *base;
1915
    bool want_zero;
1916 1917 1918 1919
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
1920
    BlockDriverState **file;
1921
    int ret;
1922
    bool done;
1923
} BdrvCoBlockStatusData;
1924

1925 1926 1927 1928 1929 1930 1931
int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
1932 1933
{
    assert(bs->file && bs->file->bs);
1934 1935
    *pnum = bytes;
    *map = offset;
1936
    *file = bs->file->bs;
1937
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1938 1939
}

1940 1941 1942 1943 1944 1945 1946
int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
1947 1948
{
    assert(bs->backing && bs->backing->bs);
1949 1950
    *pnum = bytes;
    *map = offset;
1951
    *file = bs->backing->bs;
1952
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1953 1954
}

1955 1956 1957 1958 1959
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
1960 1961 1962 1963
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
1964
 *
1965
 * If 'offset' is beyond the end of the disk image the return value is
1966
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
1967
 *
1968
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
1969 1970
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
1971
 *
1972 1973 1974 1975 1976 1977 1978 1979 1980
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
1981
 */
1982 1983 1984 1985 1986 1987 1988 1989
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
E
Eric Blake 已提交
1990
    int ret;
1991
    int64_t local_map = 0;
1992
    BlockDriverState *local_file = NULL;
E
Eric Blake 已提交
1993 1994
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;
1995

1996 1997
    assert(pnum);
    *pnum = 0;
1998 1999 2000
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
2001
        goto early_out;
2002 2003
    }

2004
    if (offset >= total_size) {
2005 2006
        ret = BDRV_BLOCK_EOF;
        goto early_out;
2007
    }
2008
    if (!bytes) {
2009 2010
        ret = 0;
        goto early_out;
2011
    }
2012

2013 2014 2015
    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
2016 2017
    }

M
Max Reitz 已提交
2018 2019
    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
2020
    if (!bs->drv->bdrv_co_block_status) {
2021
        *pnum = bytes;
2022
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2023
        if (offset + bytes == total_size) {
2024 2025
            ret |= BDRV_BLOCK_EOF;
        }
2026
        if (bs->drv->protocol_name) {
2027 2028
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
2029
            local_file = bs;
2030
        }
2031
        goto early_out;
2032 2033
    }

2034
    bdrv_inc_in_flight(bs);
E
Eric Blake 已提交
2035 2036

    /* Round out to request_alignment boundaries */
2037
    align = bs->bl.request_alignment;
E
Eric Blake 已提交
2038 2039 2040
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

2041 2042 2043 2044 2045 2046
    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
E
Eric Blake 已提交
2047 2048
    }

2049
    /*
2050
     * The driver's result must be a non-zero multiple of request_alignment.
E
Eric Blake 已提交
2051
     * Clamp pnum and adjust map to original request.
2052
     */
2053 2054
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
E
Eric Blake 已提交
2055 2056 2057
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
2058
    }
2059
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
E
Eric Blake 已提交
2060
        local_map += offset - aligned_offset;
2061
    }
2062 2063

    if (ret & BDRV_BLOCK_RAW) {
2064
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2065 2066
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
2067
        goto out;
2068 2069 2070 2071
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
2072
    } else if (want_zero) {
2073 2074
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
2075 2076
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
2077
            int64_t size2 = bdrv_getlength(bs2);
2078

2079
            if (size2 >= 0 && offset >= size2) {
2080 2081 2082 2083 2084
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

2085
    if (want_zero && local_file && local_file != bs &&
2086 2087
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
2088 2089
        int64_t file_pnum;
        int ret2;
2090

2091 2092
        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
2093 2094 2095 2096
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
2097 2098 2099 2100 2101 2102 2103
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
2104 2105 2106 2107 2108 2109 2110 2111 2112
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

2113 2114
out:
    bdrv_dec_in_flight(bs);
2115
    if (ret >= 0 && offset + *pnum == total_size) {
2116 2117
        ret |= BDRV_BLOCK_EOF;
    }
2118 2119 2120 2121
early_out:
    if (file) {
        *file = local_file;
    }
2122 2123 2124
    if (map) {
        *map = local_map;
    }
2125 2126 2127
    return ret;
}

2128 2129 2130 2131 2132 2133 2134 2135
static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
2136 2137
{
    BlockDriverState *p;
2138
    int ret = 0;
2139
    bool first = true;
2140 2141

    assert(bs != base);
2142
    for (p = bs; p != base; p = backing_bs(p)) {
2143 2144
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
2145 2146 2147 2148 2149 2150 2151 2152 2153 2154
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
2155
            *pnum = bytes;
2156 2157
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2158 2159
            break;
        }
2160 2161 2162
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
2163
        first = false;
2164 2165 2166 2167
    }
    return ret;
}

2168
/* Coroutine wrapper for bdrv_block_status_above() */
2169
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2170
{
2171
    BdrvCoBlockStatusData *data = opaque;
2172

2173 2174 2175 2176
    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
2177 2178 2179 2180
    data->done = true;
}

/*
2181
 * Synchronous wrapper around bdrv_co_block_status_above().
2182
 *
2183
 * See bdrv_co_block_status_above() for details.
2184
 */
2185 2186 2187 2188 2189 2190
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
2191 2192
{
    Coroutine *co;
2193
    BdrvCoBlockStatusData data = {
2194
        .bs = bs,
2195
        .base = base,
2196
        .want_zero = want_zero,
2197 2198 2199 2200
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
2201
        .file = file,
2202 2203 2204 2205 2206
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
2207
        bdrv_block_status_above_co_entry(&data);
2208
    } else {
2209
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2210
        bdrv_coroutine_enter(bs, co);
P
Paolo Bonzini 已提交
2211
        BDRV_POLL_WHILE(bs, !data.done);
2212
    }
2213
    return data.ret;
2214 2215
}

2216 2217 2218
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
2219
{
2220 2221
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
2222 2223
}

2224 2225
int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
2226
{
2227 2228
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
2229 2230
}

2231 2232
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
2233
{
2234 2235
    int ret;
    int64_t dummy;
2236

2237 2238
    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
2239
                                         NULL);
2240 2241 2242 2243 2244 2245 2246 2247 2248
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
2249 2250 2251
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
2252
 * or negative errno on failure.
2253
 *
2254 2255 2256 2257 2258 2259
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
2260 2261 2262 2263
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
2264
                            int64_t offset, int64_t bytes, int64_t *pnum)
2265 2266
{
    BlockDriverState *intermediate;
2267 2268
    int ret;
    int64_t n = bytes;
2269 2270 2271

    intermediate = top;
    while (intermediate && intermediate != base) {
2272
        int64_t pnum_inter;
2273
        int64_t size_inter;
2274

2275
        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2276 2277
        if (ret < 0) {
            return ret;
2278 2279
        }
        if (ret) {
2280
            *pnum = pnum_inter;
2281 2282 2283
            return 1;
        }

2284
        size_inter = bdrv_getlength(intermediate);
2285 2286 2287
        if (size_inter < 0) {
            return size_inter;
        }
2288 2289 2290
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
2291 2292
        }

2293
        intermediate = backing_bs(intermediate);
2294 2295 2296 2297 2298 2299
    }

    *pnum = n;
    return 0;
}

2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312
typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;
    QEMUIOVector        *qiov;
    int64_t             pos;
    bool                is_read;
    int                 ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
2313 2314 2315
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);
2316 2317

    if (!drv) {
2318
        ret = -ENOMEDIUM;
2319
    } else if (drv->bdrv_load_vmstate) {
2320 2321 2322 2323 2324
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
2325
    } else if (bs->file) {
2326
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2327 2328
    }

2329 2330
    bdrv_dec_in_flight(bs);
    return ret;
2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs         = bs,
            .qiov       = qiov,
            .pos        = pos,
            .is_read    = is_read,
            .ret        = -EINPROGRESS,
        };
2353
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2354

2355
        bdrv_coroutine_enter(bs, co);
2356
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2357 2358 2359 2360
        return data.ret;
    }
}

2361 2362 2363 2364 2365 2366 2367 2368
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
2369
    int ret;
2370 2371

    qemu_iovec_init_external(&qiov, &iov, 1);
2372 2373 2374 2375 2376 2377 2378

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
2379 2380 2381 2382
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
2383
    return bdrv_rw_vmstate(bs, qiov, pos, false);
2384 2385 2386 2387
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
2388 2389 2390 2391 2392 2393
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = buf,
        .iov_len    = size,
    };
2394
    int ret;
2395 2396

    qemu_iovec_init_external(&qiov, &iov, 1);
2397 2398 2399 2400 2401 2402
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
2403 2404 2405
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2406
{
2407
    return bdrv_rw_vmstate(bs, qiov, pos, true);
2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420
}

/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
2421 2422 2423 2424 2425
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

2447 2448 2449 2450 2451 2452
typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


2453 2454
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
2455
    FlushCo *rwco = opaque;
2456 2457 2458 2459 2460 2461

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
2462 2463 2464 2465
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);
2466

2467
    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2468
        bdrv_is_sg(bs)) {
2469
        goto early_exit;
2470 2471
    }

2472
    qemu_co_mutex_lock(&bs->reqs_lock);
2473
    current_gen = atomic_read(&bs->write_gen);
2474 2475

    /* Wait until any previous flushes are completed */
2476
    while (bs->active_flush_req) {
2477
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2478 2479
    }

2480
    /* Flushes reach this point in nondecreasing current_gen order.  */
2481
    bs->active_flush_req = true;
2482
    qemu_co_mutex_unlock(&bs->reqs_lock);
2483

P
Pavel Dovgalyuk 已提交
2484 2485 2486 2487 2488 2489
    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

2490 2491 2492 2493 2494
    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
F
Fam Zheng 已提交
2495
            goto out;
2496 2497 2498 2499 2500 2501 2502 2503
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

2504 2505 2506 2507 2508
    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

2509
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
M
Max Reitz 已提交
2510 2511 2512 2513 2514 2515
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
2545

2546
    if (ret < 0) {
F
Fam Zheng 已提交
2547
        goto out;
2548 2549 2550 2551 2552 2553
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
F
Fam Zheng 已提交
2554 2555
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
2556
    /* Notify any pending flushes that we have completed */
2557 2558 2559
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }
2560 2561

    qemu_co_mutex_lock(&bs->reqs_lock);
2562
    bs->active_flush_req = false;
2563 2564
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
2565
    qemu_co_mutex_unlock(&bs->reqs_lock);
2566

2567
early_exit:
2568
    bdrv_dec_in_flight(bs);
F
Fam Zheng 已提交
2569
    return ret;
2570 2571 2572 2573 2574
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
2575
    FlushCo flush_co = {
2576 2577 2578 2579 2580 2581
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
2582
        bdrv_flush_co_entry(&flush_co);
2583
    } else {
2584
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2585
        bdrv_coroutine_enter(bs, co);
P
Paolo Bonzini 已提交
2586
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2587 2588
    }

2589
    return flush_co.ret;
2590 2591 2592 2593
}

typedef struct DiscardCo {
    BlockDriverState *bs;
2594
    int64_t offset;
2595
    int bytes;
2596 2597
    int ret;
} DiscardCo;
2598
static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2599 2600 2601
{
    DiscardCo *rwco = opaque;

2602
    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
2603 2604
}

2605
int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2606
                                  int bytes)
2607
{
F
Fam Zheng 已提交
2608
    BdrvTrackedRequest req;
2609
    int max_pdiscard, ret;
2610
    int head, tail, align;
2611 2612 2613 2614 2615

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

2616 2617 2618 2619
    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

2620
    ret = bdrv_check_byte_request(bs, offset, bytes);
2621 2622 2623
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
2624
        return -EPERM;
2625
    }
2626
    assert(!(bs->open_flags & BDRV_O_INACTIVE));
2627 2628 2629 2630 2631 2632

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

E
Eric Blake 已提交
2633
    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2634 2635 2636
        return 0;
    }

2637 2638 2639 2640 2641
    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
E
Eric Blake 已提交
2642
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2643 2644
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
2645
    tail = (offset + bytes) % align;
2646

2647
    bdrv_inc_in_flight(bs);
2648
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2649

2650 2651 2652 2653 2654
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

2655 2656
    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
2657
    assert(max_pdiscard >= bs->bl.request_alignment);
2658

2659 2660
    while (bytes > 0) {
        int num = bytes;
2661 2662 2663

        if (head) {
            /* Make small requests to get to alignment boundaries. */
2664
            num = MIN(bytes, align - head);
2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }
2684

M
Max Reitz 已提交
2685 2686 2687 2688
        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
2689 2690
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2691 2692 2693 2694 2695 2696
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

2697 2698
            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
2699
            if (acb == NULL) {
F
Fam Zheng 已提交
2700 2701
                ret = -EIO;
                goto out;
2702 2703 2704 2705 2706 2707
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
F
Fam Zheng 已提交
2708
            goto out;
2709 2710
        }

2711
        offset += num;
2712
        bytes -= num;
2713
    }
F
Fam Zheng 已提交
2714 2715
    ret = 0;
out:
2716
    atomic_inc(&bs->write_gen);
2717
    bdrv_set_dirty(bs, req.offset, req.bytes);
F
Fam Zheng 已提交
2718
    tracked_request_end(&req);
2719
    bdrv_dec_in_flight(bs);
F
Fam Zheng 已提交
2720
    return ret;
2721 2722
}

2723
int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2724 2725 2726 2727
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
2728
        .offset = offset,
2729
        .bytes = bytes,
2730 2731 2732 2733 2734
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
2735
        bdrv_pdiscard_co_entry(&rwco);
2736
    } else {
2737
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2738
        bdrv_coroutine_enter(bs, co);
P
Paolo Bonzini 已提交
2739
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2740 2741 2742 2743 2744
    }

    return rwco.ret;
}

2745
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2746 2747
{
    BlockDriver *drv = bs->drv;
2748 2749 2750 2751
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;
2752

2753
    bdrv_inc_in_flight(bs);
2754
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2755 2756 2757 2758
        co.ret = -ENOTSUP;
        goto out;
    }

2759 2760 2761 2762 2763 2764 2765 2766 2767
    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
2768 2769
    }
out:
2770
    bdrv_dec_in_flight(bs);
2771 2772 2773
    return co.ret;
}

2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
2814
    size_t alignment = bdrv_min_mem_align(bs);
2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
2836 2837 2838 2839 2840 2841
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

2842
    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2843 2844 2845 2846
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
2847 2848 2849 2850 2851
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
2852 2853 2854
    BdrvChild *child;

    assert(bs->io_plugged);
2855
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2856 2857 2858 2859 2860 2861 2862 2863
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
2864 2865
    }
}
F
Fam Zheng 已提交
2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}
2890 2891 2892 2893 2894 2895 2896 2897 2898

static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src,
                                                    uint64_t src_offset,
                                                    BdrvChild *dst,
                                                    uint64_t dst_offset,
                                                    uint64_t bytes,
                                                    BdrvRequestFlags flags,
                                                    bool recurse_src)
{
2899 2900 2901
    BdrvTrackedRequest src_req, dst_req;
    BlockDriverState *src_bs = src->bs;
    BlockDriverState *dst_bs = dst->bs;
2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924
    int ret;

    if (!src || !dst || !src->bs || !dst->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    if (flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags);
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }
2925 2926 2927 2928 2929 2930 2931 2932 2933
    bdrv_inc_in_flight(src_bs);
    bdrv_inc_in_flight(dst_bs);
    tracked_request_begin(&src_req, src_bs, src_offset,
                          bytes, BDRV_TRACKED_READ);
    tracked_request_begin(&dst_req, dst_bs, dst_offset,
                          bytes, BDRV_TRACKED_WRITE);

    wait_serialising_requests(&src_req);
    wait_serialising_requests(&dst_req);
2934
    if (recurse_src) {
2935 2936 2937 2938
        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes, flags);
2939
    } else {
2940 2941 2942 2943
        ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                  src, src_offset,
                                                  dst, dst_offset,
                                                  bytes, flags);
2944
    }
2945 2946 2947 2948 2949
    tracked_request_end(&src_req);
    tracked_request_end(&dst_req);
    bdrv_dec_in_flight(src_bs);
    bdrv_dec_in_flight(dst_bs);
    return ret;
2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
                                         BdrvChild *dst, uint64_t dst_offset,
                                         uint64_t bytes, BdrvRequestFlags flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, flags, true);
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
                                       BdrvChild *dst, uint64_t dst_offset,
                                       uint64_t bytes, BdrvRequestFlags flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, flags, false);
}

int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags flags)
{
2980 2981 2982
    return bdrv_co_copy_range_from(src, src_offset,
                                   dst, dst_offset,
                                   bytes, flags);
2983
}
2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002

static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
    BdrvChild *c;
    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->role->resize) {
            c->role->resize(c);
        }
    }
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
                                  PreallocMode prealloc, Error **errp)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
3003 3004
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018
    int ret;

    assert(child->perm & BLK_PERM_RESIZE);

    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030
    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

3031
    bdrv_inc_in_flight(bs);
3032 3033 3034 3035 3036 3037 3038 3039 3040
    tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        mark_request_serialising(&req, 1);
        wait_serialising_requests(&req);
    }
3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073

    if (!drv->bdrv_co_truncate) {
        if (bs->file && drv->is_filter) {
            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
            goto out;
        }
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
    if (ret < 0) {
        goto out;
    }
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    bdrv_dirty_bitmap_truncate(bs, offset);
    bdrv_parent_cb_resize(bs);
    atomic_inc(&bs->write_gen);

out:
3074
    tracked_request_end(&req);
3075
    bdrv_dec_in_flight(bs);
3076

3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117
    return ret;
}

typedef struct TruncateCo {
    BdrvChild *child;
    int64_t offset;
    PreallocMode prealloc;
    Error **errp;
    int ret;
} TruncateCo;

static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
    TruncateCo *tco = opaque;
    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
                                tco->errp);
}

int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
                  Error **errp)
{
    Coroutine *co;
    TruncateCo tco = {
        .child      = child,
        .offset     = offset,
        .prealloc   = prealloc,
        .errp       = errp,
        .ret        = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_truncate_co_entry(&tco);
    } else {
        co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
        qemu_coroutine_enter(co);
        BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
    }

    return tco.ret;
}