/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

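/* SLICE_TIME is the interval after which the job voluntarily yields or
 * sleeps (see mirror_throttle() and the main loop of mirror_run()); it also
 * serves as the rate-limit slice length.  MAX_IN_FLIGHT caps the number of
 * concurrent asynchronous copy operations. */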
#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
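 * The MirrorBuffer header is overlaid on the first bytes of each free
 * chunk of s->buf itself; see mirror_free_init().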
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

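/* A single in-flight copy, zero or discard operation; a pointer to it is
 * passed as the opaque argument to the AIO completion callbacks. */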
typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                    0, mirror_write_complete, op);
}

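/* Clamp *nb_sectors so the request does not run past the end of the
 * device backing this job. */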
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original. */
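/* For illustration, with hypothetical sizes: at a 64 KB granularity (128
 * sectors per chunk) and a 128 KB target cluster (256 sectors), a request
 * for sectors [256, 384) whose cluster is not yet in cow_bitmap is widened
 * to [256, 512) and the function returns 128; widening at the head, by
 * contrast, does not change the return value. */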
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of the source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

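/* Yield the job coroutine until one of the in-flight operations completes;
 * mirror_iteration_done() re-enters the coroutine when it sees
 * waiting_for_io set. */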
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is a nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_discard(s->target, sector_num, op->nb_sectors,
                        mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

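/* Copy one batch of consecutive dirty chunks to the target.  Returns the
 * rate-limiting delay, in nanoseconds, that the caller should observe
 * before starting the next iteration. */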
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in-flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
        }
        assert(hbitmap_next == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            break;
        case MIRROR_METHOD_ZERO:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
            break;
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors);
        }
    }
    return delay_ns;
}

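/* Carve s->buf into granularity-sized chunks and thread them all onto the
 * buf_free list; each chunk's leading bytes double as its MirrorBuffer
 * list header. */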
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

typedef struct {
    int ret;
} MirrorExitData;

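/* Runs in the main loop, scheduled via block_job_defer_to_main_loop() at
 * the end of mirror_run(); this is where the graph is switched over to the
 * target on successful completion. */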
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);

        /* We just changed the BDS the job BB refers to */
        blk_remove_bs(job->blk);
        blk_insert_bs(job->blk, src);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
        aio_enable_external(iohandler_get_aio_context());
    }
    bdrv_unref(src);
}

static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
        return 0;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while the
             * drain below runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_co_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_begin isn't enough to quiesce it. This is ugly,
         * we need a block layer API change to achieve this. */
        aio_disable_external(iohandler_get_aio_context());
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *src, *target;

    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target) != backing) {
            bdrv_set_backing_hd(target, backing);
        }
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

/* There is no matching mirror_resume() because mirror_run() will begin
 * iterating again when the job is resumed.
 */
static void coroutine_fn mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_drain(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockDriverState *target, const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->target = blk_new();
    blk_insert_bs(s->target, target);

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run, s);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co);
}

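/* Public entry point of the mirror job; as of this version it is reached
 * from the QMP drive-mirror and blockdev-mirror commands (blockdev.c). */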
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

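/* Active commit is implemented as a mirror job whose target is the base
 * image: the overlay's contents are mirrored into base, which then replaces
 * it in the backing chain (see mirror_exit()). */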
void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}