/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

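/* One in-flight copy operation.  The qiov is backed by granularity-sized
 * chunks taken from s->buf_free, so the completion path must return them
 * to that list (see mirror_iteration_done). */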
typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

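/* Write completion: on error, re-mark the range dirty so that it is
 * retried on a later iteration. */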
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

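/* Read completion: on success, immediately submit the write to the target,
 * chaining into mirror_write_complete. */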
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                    0, mirror_write_complete, op);
}

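/* Clip nb_sectors so the range does not extend past the end of the device. */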
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

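/* Yield until an in-flight operation completes; mirror_iteration_done uses
 * waiting_for_io to know that it must re-enter the coroutine. */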
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

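/* Submit an async write_zeroes or discard instead of copying, for ranges
 * that the block status query reported as containing no data. */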
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_discard(s->target, sector_num, op->nb_sectors,
                        mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

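/* Copy one batch of consecutive dirty chunks.  Returns the delay, in
 * nanoseconds, that the rate limiter asks the caller to sleep. */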
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
        }
        assert(hbitmap_next == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors, &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            break;
        case MIRROR_METHOD_ZERO:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
            break;
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors);
        }
    }
    return delay_ns;
}

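/* Slice s->buf into granularity-sized chunks and put them all on the
 * buf_free list; the MirrorBuffer headers live inside the buffer itself. */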
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

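/* mirror_exit runs in the main loop after mirror_run has deferred to it;
 * this is where the target replaces the source (or the to_replace node)
 * in the graph. */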
typedef struct {
    int ret;
} MirrorExitData;

static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);

        /* We just changed the BDS the job BB refers to */
        blk_remove_bs(job->blk);
        blk_insert_bs(job->blk, src);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
        aio_enable_external(iohandler_get_aio_context());
    }
    bdrv_unref(src);
}

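/* Sleep, at most once per SLICE_TIME, so that cancellation and pause
 * requests are serviced during the long-running synchronous loops. */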
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

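/* Pre-fill the dirty bitmap for the initial full/top sync: mark extents
 * that are allocated above the base, or everything if the target cannot
 * guarantee reading zeroes from unallocated clusters. */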
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    bool mark_all_dirty = base == NULL && !bdrv_has_zero_init(target_bs);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        assert(n > 0);
        if (ret == 1 || mark_all_dirty) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_co_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_begin isn't enough to quiesce it. This is ugly,
         * we need a block layer API change to achieve this. */
        aio_disable_external(iohandler_get_aio_context());
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *src, *target;

    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target) != backing) {
            bdrv_set_backing_hd(target, backing);
        }
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

/* There is no matching mirror_resume() because mirror_run() will begin
 * iterating again when the job is resumed.
 */
static void coroutine_fn mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_drain(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

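/* Common setup shared by drive-mirror and active block-commit; the two
 * callers differ only in the BlockJobDriver they pass and in how the
 * backing chain is treated on completion. */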
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockDriverState *target, const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->target = blk_new();
    blk_insert_bs(s->target, target);

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run, s);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co);
}

void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

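/* Active commit is a mirror job whose target is the base image; when it
 * completes, mirror_exit makes the base replace the top in the chain. */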
void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}