/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a single allocation carved into granularity-sized
 * chunks.  Free chunks are kept on a free list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    unsigned long *in_flight_bitmap;
    int in_flight;
    int sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->common.bs,
                                      s->on_source_error, true, error);
    } else {
        return block_job_error_action(&s->common, s->target,
                                      s->on_target_error, false, error);
    }
}

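/* Complete one mirroring operation: return its buffer chunks to the free
 * list, clear the in-flight bits, account for progress, and wake the job
 * coroutine if it is waiting for an operation to finish. */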
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co, NULL);
    }
}

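/* AIO callback for writes to the target.  On failure, the affected sectors
 * are marked dirty again so they will be retried, and the configured
 * on-target-error policy decides whether the job fails. */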
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

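/* AIO callback for reads from the source; on success, chain directly into
 * the asynchronous write of the same sectors to the target. */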
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector relative to the original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
                               &align_sector_num, &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

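/* Yield until at least one in-flight operation completes;
 * mirror_iteration_done re-enters the coroutine. */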
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockDriverState *source = s->common.bs;
    int sectors_per_chunk, nb_chunks;
    int ret = nb_sectors;
    MirrorOp *op;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    assert(nb_sectors);

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(nb_sectors % sectors_per_chunk));
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = nb_sectors / sectors_per_chunk;

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
    return ret;
}

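/* Submit an asynchronous zero-write or discard to the target for a range
 * that does not need to be read from the source. */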
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is a nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
                         mirror_write_complete, op);
    } else {
        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

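/* Copy one batch of consecutive dirty chunks from the source to the target.
 * Returns the delay, in nanoseconds, that the caller should sleep to honour
 * the configured rate limit. */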
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int64_t sector_num;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in-flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            if (nb_chunks > 0) {
                break;
            }
            trace_mirror_yield_in_flight(s, next_sector, s->in_flight);
            mirror_wait_for_io(s);
            /* Now retry.  */
        } else {
            hbitmap_next = hbitmap_iter_next(&s->hbi);
            assert(hbitmap_next == next_sector);
            nb_chunks++;
        }
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
                                   &target_sector_num, &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            break;
        case MIRROR_METHOD_ZERO:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
            break;
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= io_sectors / sectors_per_chunk;
        delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
    }
    return delay_ns;
}

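/* Carve s->buf into granularity-sized chunks and put all of them on the
 * free list. */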
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

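/* Wait until all in-flight mirror operations have completed. */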
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

typedef struct {
    int ret;
} MirrorExitData;

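/* Deferred to the main loop once mirror_run finishes: switches the graph
 * over to the target if the job completed successfully, then drops all
 * references taken by the job. */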
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->common.bs;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = s->common.bs;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        /* This was checked in mirror_start_job(), but meanwhile one of the
         * nodes could have been newly attached to a BlockBackend. */
        if (to_replace->blk && s->target->blk) {
            error_report("block job: Can't create node with two BlockBackends");
            data->ret = -EINVAL;
            goto out;
        }

        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
        }
        bdrv_replace_in_backing_chain(to_replace, s->target);
    }

out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(s->target, s->common.blocker);
    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    bdrv_unref(src);
}

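/* The mirror job coroutine.  Unless the job runs in "none" mode, a first
 * pass seeds the dirty bitmap from the source's allocation status; the main
 * loop then copies dirty chunks until the job is cancelled or completed. */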
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int n;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !s->target->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);

        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding the int limit. */
            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                                 end - sector_num);
            int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

            if (now - last_pause_ns > SLICE_TIME) {
                last_pause_ns = now;
                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
            }

            if (block_job_is_cancelled(&s->common)) {
                goto immediate_exit;
            }

            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);

            if (ret < 0) {
                goto immediate_exit;
            }

            assert(n > 0);
            if (ret == 1 || mark_all_dirty) {
                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
            }
            sector_num += n;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * the mirror job runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    if (s->target->blk) {
        blk_iostatus_disable(s->target->blk);
    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(s->common.bs);
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

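/* Set the job's rate limit; speed is given in bytes per second and is
 * converted to sectors for the rate limiter. */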
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void mirror_iostatus_reset(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (s->target->blk) {
        blk_iostatus_reset(s->target->blk);
    }
}

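/* block-job-complete callback: open the target's backing file, block the
 * node to be replaced (if any), and tell mirror_run to finish. */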
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    Error *local_err = NULL;
    int ret;

    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
    }
    if (!s->synced) {
        error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
        return;
    }

    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_MIRROR,
    .set_speed     = mirror_set_speed,
    .iostatus_reset = mirror_iostatus_reset,
    .complete      = mirror_complete,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = mirror_set_speed,
    .iostatus_reset = mirror_iostatus_reset,
    .complete      = mirror_complete,
};

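/* Common setup for drive-mirror and active commit: validate the parameters,
 * create the block job and its dirty bitmap, and start the mirror_run
 * coroutine. */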
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;
    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    /* We can't support this case as long as the block layer can't handle
     * multiple BlockBackends per BlockDriverState. */
    if (replaces) {
        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
        if (replaced_bs == NULL) {
            return;
        }
    } else {
        replaced_bs = bs;
    }
    if (replaced_bs->blk && target->blk) {
        error_setg(errp, "Can't create node with two BlockBackends");
        return;
    }

    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->target = target;
    s->is_none_mode = is_none_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(s->target, s->common.blocker);

    if (s->target->blk) {
        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
        blk_iostatus_enable(s->target->blk);
    }
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
}

void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(bs, target, replaces,
                     speed, granularity, buf_size,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                            "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    bdrv_ref(base);
    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}