mirror.c 43.0 KB
Newer Older
P
Paolo Bonzini 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
P
Paolo Bonzini 已提交
15
#include "trace.h"
16
#include "block/blockjob_int.h"
17
#include "block/block_int.h"
18
#include "sysemu/block-backend.h"
19
#include "qapi/error.h"
20
#include "qapi/qmp/qerror.h"
P
Paolo Bonzini 已提交
21
#include "qemu/ratelimit.h"
22
#include "qemu/bitmap.h"
P
Paolo Bonzini 已提交
23

24 25
/* Length of one throttling/yield slice, in nanoseconds. */
#define SLICE_TIME    100000000ULL /* ns */
/* Upper bound on the number of mirror operations outstanding at once. */
#define MAX_IN_FLIGHT 16
/* Largest single request, expressed in sectors. */
#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 MiB */
/* Default buffer size in bytes: one maximum-sized request per in-flight op. */
#define DEFAULT_MIRROR_BUF_SIZE \
    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)
29 30 31 32 33 34 35

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 *
 * The list node lives inside the free chunk itself: mirror_free_init() and
 * mirror_iteration_done() cast chunk addresses to MirrorBuffer * before
 * linking them into the free list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;
P
Paolo Bonzini 已提交
36 37 38 39

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
K
Kevin Wolf 已提交
40
    BlockBackend *target;
41 42
    BlockDriverState *mirror_top_bs;
    BlockDriverState *source;
F
Fam Zheng 已提交
43
    BlockDriverState *base;
44

45 46 47 48 49 50
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
F
Fam Zheng 已提交
51
    bool is_none_mode;
M
Max Reitz 已提交
52
    BlockMirrorBackingMode backing_mode;
53
    BlockdevOnError on_source_error, on_target_error;
P
Paolo Bonzini 已提交
54 55
    bool synced;
    bool should_complete;
56
    int64_t granularity;
57
    size_t buf_size;
M
Max Reitz 已提交
58
    int64_t bdev_length;
59
    unsigned long *cow_bitmap;
F
Fam Zheng 已提交
60
    BdrvDirtyBitmap *dirty_bitmap;
61
    BdrvDirtyBitmapIter *dbi;
P
Paolo Bonzini 已提交
62
    uint8_t *buf;
63 64
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
65

66
    uint64_t last_pause_ns;
67
    unsigned long *in_flight_bitmap;
68
    int in_flight;
69
    int64_t sectors_in_flight;
70
    int ret;
71
    bool unmap;
K
Kevin Wolf 已提交
72
    bool waiting_for_io;
F
Fam Zheng 已提交
73 74
    int target_cluster_sectors;
    int max_iov;
75
    bool initial_zeroing_ongoing;
P
Paolo Bonzini 已提交
76 77
} MirrorBlockJob;

78 79 80 81 82 83 84
/* One outstanding mirror operation; passed as the opaque pointer to the AIO
 * completion callbacks and freed in mirror_iteration_done().
 */
typedef struct MirrorOp {
    MirrorBlockJob *s;      /* owning job */
    QEMUIOVector qiov;      /* buffer chunks used by a copy; left zeroed for
                             * zero/discard operations */
    int64_t sector_num;     /* first sector covered by the operation */
    int nb_sectors;         /* number of sectors covered */
} MirrorOp;

85 86 87 88 89
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
90 91
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
92
    } else {
93 94
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
95 96 97
    }
}

98 99 100
/* Finish one mirror operation and release everything it held.
 *
 * Returns the operation's buffer chunks to the free list, clears its chunks
 * in the in-flight bitmap, updates the accounting, frees @op and, if the job
 * coroutine is blocked in mirror_wait_for_io(), re-enters it.
 *
 * @ret is the I/O result; the COW bitmap and the job offset are only updated
 * when it is >= 0.
 */
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    /* Each iovec base is the start of a buffer chunk; put it back on the
     * free list.  Zero/discard operations have an empty qiov. */
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        /* Initial zeroing of the target does not count as progress. */
        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
140 141

    aio_context_acquire(blk_get_aio_context(s->common.blk));
142 143 144
    if (ret < 0) {
        BlockErrorAction action;

145
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
146
        action = mirror_error_action(s, false, -ret);
W
Wenchao Xia 已提交
147
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
148 149 150 151
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
152
    aio_context_release(blk_get_aio_context(s->common.blk));
153 154 155 156 157 158
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
159 160

    aio_context_acquire(blk_get_aio_context(s->common.blk));
161 162 163
    if (ret < 0) {
        BlockErrorAction action;

164
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
165
        action = mirror_error_action(s, true, -ret);
W
Wenchao Xia 已提交
166
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
167 168 169 170
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
171 172 173
    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
174
    }
175
    aio_context_release(blk_get_aio_context(s->common.blk));
176 177
}

178 179 180 181 182 183 184 185
/* Limit *nb_sectors so that a range starting at sector_num does not extend
 * past the end of the source, the end being s->bdev_length converted to
 * sectors.  A count that already fits is left unchanged.
 *
 * NOTE(review): the comparison is carried out in whatever type the usual
 * arithmetic conversions give the MIN() operands, so the result for a
 * sector_num beyond the end depends on the type of BDRV_SECTOR_SIZE -
 * confirm before relying on it.
 */
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

F
Fam Zheng 已提交
186 187 188 189 190
/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original.
 *
 * COW is considered necessary when the chunk holding the first or the last
 * sector of the range is not yet set in s->cow_bitmap.  The resulting range is
 * capped at max_iov chunks (kept cluster-aligned when COW is needed) and
 * clipped to the end of the source.
 *
 * *sector_num and *nb_sectors are updated in place.  The return value is
 * asserted to be non-negative.
 */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

F
Fam Zheng 已提交
226 227 228 229 230 231 232 233
/* Yield the current coroutine until an in-flight operation completes.
 *
 * waiting_for_io is raised across the yield; mirror_iteration_done() tests it
 * and re-enters s->common.co.  The assertion rejects a nested wait.
 */
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

F
Fam Zheng 已提交
234
/* Submit async read while handling COW.
 *
 * The request is limited to buf_size and to max_iov chunks, optionally
 * widened to target-cluster boundaries by mirror_cow_align(), and then waits
 * until enough free buffer chunks are available before it is issued.
 * Completion continues in mirror_read_complete().
 *
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    /* Wait for completions to hand back enough buffer chunks. */
    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        /* The last chunk may be only partially used. */
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

/* Submit an asynchronous zero write or discard to the target.
 *
 * @sector_num/@nb_sectors give the range; @is_discard selects a discard,
 * otherwise zeroes are written (with BDRV_REQ_MAY_UNMAP if s->unmap is set).
 * No source data is read and no buffer chunks are consumed.  Completion is
 * handled by mirror_write_complete().
 */
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
                         op->nb_sectors << BDRV_SECTOR_BITS,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

/* Mirror one run of consecutive dirty chunks.
 *
 * Takes the next dirty sector from the bitmap iterator (restarting from 0
 * when the iterator is exhausted), extends the run over following dirty
 * chunks that are not in flight, clears their dirty bits, marks them in
 * flight, and then submits copy, zero or discard requests for the run.
 *
 * Returns the delay in nanoseconds computed by the rate limiter for the last
 * submitted request, or 0 if no speed limit is set or an error has been
 * recorded in s->ret.
 */
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        /* Iterator exhausted: start over from the beginning. */
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    /* Do not start on a chunk that still has an operation outstanding. */
    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one.  The scan stops at the end of the image, at the first clean chunk,
     * or at the first chunk that is still in flight. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            /* Status unknown: fall back to copying a bounded amount. */
            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
        } else if (ret & BDRV_BLOCK_DATA) {
            io_sectors = MIN(io_sectors, max_io_sectors);
        }

        /* Work in whole chunks, and in at least one chunk. */
        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            /* Zero/discard only when the range is already aligned to the
             * target's clusters; otherwise keep the default copy method. */
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        if (s->ret < 0) {
            return 0;
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            /* Not charged to the rate limit when the target can write
             * zeroes with unmap. */
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}
465

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
/* Carve s->buf into granularity-sized chunks and put all of them on the free
 * list.
 *
 * Must be called with an empty free list (buf_free_count == 0).  buf_size has
 * to be a positive multiple of the granularity: the loop steps through the
 * buffer one whole chunk at a time, so a partial tail chunk would make the
 * unsigned remainder wrap around and the loop run past the end of the buffer.
 * This precondition is asserted.
 */
static void mirror_free_init(MirrorBlockJob *s)
{
    /* Same width as the field, so a large granularity is not truncated. */
    int64_t granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    assert(granularity > 0);
    assert(buf_size % granularity == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        /* The list node is stored inside the free chunk itself. */
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

483 484 485 486 487
/* Block until no mirror operation is outstanding (s->in_flight == 0).
 *
 * This is also used for the .pause callback. There is no matching
 * mirror_resume() because mirror_run() will begin iterating again
 * when the job is resumed.
 */
static void mirror_wait_for_all_io(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

494 495 496 497 498 499 500 501 502
/* Result handed from mirror_run() to mirror_exit(); allocated by the former
 * and freed by the latter.
 */
typedef struct {
    int ret;    /* job result passed on to block_job_completed() */
} MirrorExitData;

/* Final stage of the job, scheduled by mirror_run() through
 * block_job_defer_to_main_loop().
 *
 * @opaque is the MirrorExitData allocated by mirror_run(); it is freed here.
 * If completion was requested and the job succeeded, the target replaces the
 * source (or s->to_replace) in the graph.  In all cases the mirror filter node
 * is removed, the job's references are dropped and the job is marked
 * completed with data->ret.
 */
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    BlockDriverState *mirror_top_bs = s->mirror_top_bs;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);
    bdrv_ref(mirror_top_bs);

    /* We don't access the source any more. Dropping any WRITE/RESIZE is
     * required before it could become a backing file of target_bs. */
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target_bs) != backing) {
            bdrv_set_backing_hd(target_bs, backing);
        }
    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        /* NOTE(review): the result of bdrv_reopen() is not checked and no
         * Error object is passed - confirm that a failed reopen is acceptable
         * here. */
        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    blk_unref(s->target);
    s->target = NULL;

    /* Remove the mirror filter driver from the graph. Before this, get rid of
     * the blockers on the intermediate nodes so that the resulting state is
     * valid. */
    block_job_remove_all_bdrv(job);
    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));

    /* We just changed the BDS the job BB refers to (with either or both of the
     * bdrv_replace_in_backing_chain() calls), so switch the BB back so the
     * cleanup does the right thing. We don't need any permissions any more
     * now. */
    blk_remove_bs(job->blk);
    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);

    block_job_completed(&s->common, data->ret);

    g_free(data);
    /* Matches the bdrv_drained_begin() on the source done in mirror_run(). */
    bdrv_drained_end(src);
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
}

578 579 580 581 582 583 584 585 586 587 588 589
/* Give other users of the main loop a chance to run.
 *
 * Once more than SLICE_TIME nanoseconds have elapsed since the last recorded
 * pause, record the current time and sleep for 0 ns; otherwise only pass
 * through a job pause point.
 */
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t current_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (!(current_ns - s->last_pause_ns > SLICE_TIME)) {
        /* Still within the current slice. */
        block_job_pause_point(&s->common);
        return;
    }

    s->last_pause_ns = current_ns;
    block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
}

590 591 592 593
/* Prepare the target and the dirty bitmap before the main mirror loop.
 *
 * When there is no base and the target is not known to read as zeroes, the
 * target is first zeroed completely; if it cannot write zeroes with unmap,
 * the whole source is marked dirty instead and the function returns.  After
 * that, every range that bdrv_is_allocated_above() reports as allocated
 * above @base is marked dirty.
 *
 * Returns 0 on success or cancellation (the caller checks for cancellation
 * itself) and a negative value if querying the allocation status fails.
 */
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        /* Zeroing must not be counted as progress; see
         * mirror_iteration_done(). */
        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            /* Largest granularity-aligned request whose byte count still
             * fits in an int. */
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_wait_for_all_io(s);
        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        /* n is the length of the run sharing the reported status. */
        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

658 659 660 661 662 663 664 665 666 667 668 669 670 671
/* Flush the target.  Called when going out of the streaming phase to flush
 * the bulk of the data to the medium, or just before completing.
 *
 * Returns the result of blk_flush().  On failure the error policy for the
 * target is consulted and, if it says "report", the error is stored in s->ret.
 */
static int mirror_flush(MirrorBlockJob *s)
{
    int flush_ret = blk_flush(s->target);

    if (flush_ret >= 0) {
        return flush_ret;
    }

    if (mirror_error_action(s, false, -flush_ret) ==
        BLOCK_ERROR_ACTION_REPORT) {
        s->ret = flush_ret;
    }
    return flush_ret;
}

P
Paolo Bonzini 已提交
672 673 674
/* Main coroutine of the mirror job.
 *
 * @opaque is the MirrorBlockJob.  The function determines the source length,
 * allocates the bitmaps and the bounce buffer, initializes the dirty bitmap
 * (unless is_none_mode is set) and then loops: copy dirty chunks, report
 * BLOCK_JOB_READY once source and target are in sync, and finish when
 * completion or cancellation is requested.
 *
 * On every exit path the per-run resources are released and mirror_exit() is
 * scheduled in the main loop with the final result.
 */
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    /* False only when the source is still drained from the completion path */
    bool need_drain = true;
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    }

    /* Active commit must resize the base image if its size differs from the
     * active layer. */
    if (s->base == blk_bs(s->target)) {
        int64_t base_length;

        base_length = blk_getlength(s->target);
        if (base_length < 0) {
            ret = base_length;
            goto immediate_exit;
        }

        if (s->bdev_length > base_length) {
            ret = blk_truncate(s->target, s->bdev_length);
            if (ret < 0) {
                goto immediate_exit;
            }
        }
    }

    if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    /* Number of granularity-sized chunks in the source. */
    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            if (!s->synced) {
                if (mirror_flush(s) < 0) {
                    /* Go check s->ret.  */
                    continue;
                }
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                block_job_event_ready(&s->common);
                s->synced = true;
            }

            should_complete = s->should_complete ||
                block_job_is_cancelled(&s->common);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * re-reading the dirty count, or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while the job
             * runs, so drain the source now.  Before deciding
             * whether to switch to target check one last time if I/O has
             * come in the meanwhile, and if not flush the data to disk.
             */
            trace_mirror_before_drain(s, cnt);

            bdrv_drained_begin(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            if (cnt > 0 || mirror_flush(s) < 0) {
                bdrv_drained_end(bs);
                continue;
            }

            /* The two disks are in sync.  Exit and report successful
             * completion.  The source stays drained; mirror_exit() ends the
             * drained section.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            need_drain = false;
            break;
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            /* Idle and in sync: sleep a full slice; otherwise only yield. */
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        assert(need_drain);
        mirror_wait_for_all_io(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_dirty_iter_free(s->dbi);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    /* Freed by mirror_exit(). */
    data = g_malloc(sizeof(*data));
    data->ret = ret;

    if (need_drain) {
        bdrv_drained_begin(bs);
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

/*
 * Block-job .set_speed callback.
 *
 * A negative @speed is rejected through @errp.  Otherwise the job's rate
 * limiter is reprogrammed with speed / BDRV_SECTOR_SIZE per SLICE_TIME.
 */
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *mirror;

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }

    mirror = container_of(job, MirrorBlockJob, common);
    ratelimit_set_speed(&mirror->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

P
Paolo Bonzini 已提交
905 906 907
/*
 * Block-job .complete callback.
 *
 * Fails through @errp if the job is not synced yet.  Otherwise it optionally
 * opens the target's backing file (MIRROR_OPEN_BACKING_CHAIN), looks up and
 * blocks the node named by s->replaces, and finally flags the job as
 * should_complete and re-enters it.
 */
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockDriverState *target_bs = blk_bs(mirror->target);

    if (!mirror->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (mirror->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        /* The target is expected to have no backing child attached yet */
        assert(!target_bs->backing);
        if (bdrv_open_backing_file(target_bs, NULL, "backing", errp) < 0) {
            return;
        }
    }

    /* Block all operations on the node that is going to be replaced */
    if (mirror->replaces) {
        BlockDriverState *victim = bdrv_find_node(mirror->replaces);
        AioContext *victim_ctx;

        mirror->to_replace = victim;
        if (!victim) {
            error_setg(errp, "Node name '%s' not found", mirror->replaces);
            return;
        }

        victim_ctx = bdrv_get_aio_context(victim);
        aio_context_acquire(victim_ctx);

        /* TODO Translate this into permission system. Current definition of
         * GRAPH_MOD would require to request it for the parents; they might
         * not even be BlockDriverStates, however, so a BdrvChild can't address
         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&mirror->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(victim, mirror->replace_blocker);
        /* Take a reference on the node while it is blocked */
        bdrv_ref(victim);

        aio_context_release(victim_ctx);
    }

    mirror->should_complete = true;
    block_job_enter(&mirror->common);
}

957
/*
 * Block-job .pause callback: calls mirror_wait_for_all_io() on the mirror
 * job before returning.
 */
static void mirror_pause(BlockJob *job)
{
    mirror_wait_for_all_io(container_of(job, MirrorBlockJob, common));
}

/*
 * Block-job .attached_aio_context callback: hands @new_context to the
 * target BlockBackend via blk_set_aio_context().
 */
static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target_blk = mirror->target;

    blk_set_aio_context(target_blk, new_context);
}

971 972 973 974 975 976 977 978 979 980 981 982 983 984 985
/*
 * Block-job .drain callback: drains the target BlockBackend, if there is one.
 */
static void mirror_drain(BlockJob *job)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *blk = mirror->target;

    if (!blk) {
        return;
    }

    /* Hold our own reference across the drain in case blk_drain triggers
     * execution of mirror_complete...
     */
    blk_ref(blk);
    blk_drain(blk);
    blk_unref(blk);
}

986
static const BlockJobDriver mirror_job_driver = {
987 988 989
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
990
    .start                  = mirror_run,
991 992 993
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
994
    .drain                  = mirror_drain,
P
Paolo Bonzini 已提交
995 996
};

F
Fam Zheng 已提交
997
static const BlockJobDriver commit_active_job_driver = {
998 999 1000
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1001
    .start                  = mirror_run,
1002 1003 1004
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1005
    .drain                  = mirror_drain,
F
Fam Zheng 已提交
1006 1007
};

1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078
/* Read on the filter node: forwarded unchanged to the backing child. */
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_preadv(backing_child, offset, bytes, qiov, flags);
}

/* Write on the filter node: forwarded unchanged to the backing child. */
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_pwritev(backing_child, offset, bytes, qiov, flags);
}

/* Flush on the filter node: flushes the node behind the backing child. */
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
    BlockDriverState *backing_node = bs->backing->bs;

    return bdrv_co_flush(backing_node);
}

/*
 * Block status for the filter node.
 *
 * Reports all @nb_sectors in *@pnum and sets *@file to the backing node.
 * Returns BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA combined
 * with the byte offset of @sector_num.
 */
static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    int64_t status;

    *pnum = nb_sectors;
    *file = bs->backing->bs;

    status = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA;
    status |= sector_num << BDRV_SECTOR_BITS;
    return status;
}

/* Write-zeroes on the filter node: forwarded unchanged to the backing child. */
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_pwrite_zeroes(backing_child, offset, count, flags);
}

/* Discard on the filter node: applied to the node behind the backing child. */
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
    int64_t offset, int count)
{
    BlockDriverState *backing_node = bs->backing->bs;

    return bdrv_co_pdiscard(backing_node, offset, count);
}

/* .bdrv_close callback of bdrv_mirror_top: intentionally empty, this function
 * performs no work. */
static void bdrv_mirror_top_close(BlockDriverState *bs)
{
}

/*
 * .bdrv_child_perm callback of bdrv_mirror_top.
 *
 * *@nperm becomes BLK_PERM_WRITE if @perm contains it, and 0 otherwise.
 * *@nshared is always BLK_PERM_ALL.  @bs, @c, @role and @shared are not
 * consulted.
 */
static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    /* Must be able to forward guest writes to the real image */
    *nperm = (perm & BLK_PERM_WRITE) ? BLK_PERM_WRITE : 0;
    *nshared = BLK_PERM_ALL;
}

/* Dummy filter node: it provides consistent read to its users without
 * requiring it from its backing file, and it allows writes on the backing
 * file chain.  All I/O callbacks forward to the backing child. */
static BlockDriver bdrv_mirror_top = {
    .format_name                = "mirror_top",

    /* Data path */
    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,

    /* Metadata and lifecycle */
    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
    .bdrv_close                 = bdrv_mirror_top_close,
};

1079
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
1080 1081 1082
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
                             uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1083
                             BlockMirrorBackingMode backing_mode,
1084 1085
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
1086
                             bool unmap,
1087
                             BlockCompletionFunc *cb,
1088 1089
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
1090
                             bool is_none_mode, BlockDriverState *base,
1091
                             bool auto_complete, const char *filter_node_name)
P
Paolo Bonzini 已提交
1092 1093
{
    MirrorBlockJob *s;
1094 1095 1096
    BlockDriverState *mirror_top_bs;
    bool target_graph_mod;
    bool target_is_backing;
1097
    int ret;
P
Paolo Bonzini 已提交
1098

1099
    if (granularity == 0) {
1100
        granularity = bdrv_get_default_bitmap_granularity(target);
1101 1102 1103 1104
    }

    assert ((granularity & (granularity - 1)) == 0);

W
Wen Congyang 已提交
1105 1106 1107 1108 1109 1110 1111 1112
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }
F
Fam Zheng 已提交
1113

1114 1115 1116
    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
     * the backing chain writable. */
1117 1118
    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
                                         BDRV_O_RDWR, errp);
1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
    if (mirror_top_bs == NULL) {
        return;
    }
    mirror_top_bs->total_sectors = bs->total_sectors;

    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
     * it alive until block_job_create() even if bs has no parent. */
    bdrv_ref(mirror_top_bs);
    bdrv_drained_begin(bs);
    bdrv_append(mirror_top_bs, bs);
    bdrv_drained_end(bs);

    /* Make sure that the source is not resized while the job is running */
    s = block_job_create(job_id, driver, mirror_top_bs,
                         BLK_PERM_CONSISTENT_READ,
                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
1136
                         creation_flags, cb, opaque, errp);
1137
    bdrv_unref(mirror_top_bs);
P
Paolo Bonzini 已提交
1138
    if (!s) {
1139
        goto fail;
P
Paolo Bonzini 已提交
1140
    }
1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159
    s->source = bs;
    s->mirror_top_bs = mirror_top_bs;

    /* No resize for the target either; while the mirror is still running, a
     * consistent read isn't necessarily possible. We could possibly allow
     * writes and graph modifications, though it would likely defeat the
     * purpose of a mirror, so leave them blocked for now.
     *
     * In the case of active commit, things look a bit different, though,
     * because the target is an already populated backing file in active use.
     * We can allow anything except resize there.*/
    target_is_backing = bdrv_chain_contains(bs, target);
    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                        BLK_PERM_WRITE_UNCHANGED |
                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
                                             BLK_PERM_WRITE |
                                             BLK_PERM_GRAPH_MOD : 0));
1160 1161
    ret = blk_insert_bs(s->target, target, errp);
    if (ret < 0) {
1162
        goto fail;
1163
    }
K
Kevin Wolf 已提交
1164

1165
    s->replaces = g_strdup(replaces);
1166 1167
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
F
Fam Zheng 已提交
1168
    s->is_none_mode = is_none_mode;
M
Max Reitz 已提交
1169
    s->backing_mode = backing_mode;
F
Fam Zheng 已提交
1170
    s->base = base;
1171
    s->granularity = granularity;
W
Wen Congyang 已提交
1172
    s->buf_size = ROUND_UP(buf_size, granularity);
1173
    s->unmap = unmap;
1174 1175 1176
    if (auto_complete) {
        s->should_complete = true;
    }
1177

1178
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
1179
    if (!s->dirty_bitmap) {
1180
        g_free(s->replaces);
K
Kevin Wolf 已提交
1181
        blk_unref(s->target);
1182
        block_job_unref(&s->common);
1183 1184
        return;
    }
1185

1186
    /* Required permissions are already taken with blk_new() */
1187 1188 1189
    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);

1190 1191
    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
1192
    if (target_is_backing) {
1193 1194
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
             * ourselves at s->base (if writes are blocked for a node, they are
             * also blocked for its backing file). The other options would be a
             * second filter driver above s->base (== target). */
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                     errp);
            if (ret < 0) {
                goto fail;
            }
1205 1206
        }
    }
1207

J
John Snow 已提交
1208 1209
    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
1210 1211 1212 1213 1214 1215 1216 1217 1218 1219
    return;

fail:
    if (s) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
    }

    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));
P
Paolo Bonzini 已提交
1220
}
F
Fam Zheng 已提交
1221

1222 1223
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
1224
                  int64_t speed, uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1225 1226
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
F
Fam Zheng 已提交
1227
                  BlockdevOnError on_target_error,
1228
                  bool unmap, const char *filter_node_name, Error **errp)
F
Fam Zheng 已提交
1229 1230 1231 1232
{
    bool is_none_mode;
    BlockDriverState *base;

1233 1234
    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
1235 1236
        return;
    }
F
Fam Zheng 已提交
1237
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
1238
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
1239
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
M
Max Reitz 已提交
1240
                     speed, granularity, buf_size, backing_mode,
1241
                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
1242 1243
                     &mirror_job_driver, is_none_mode, base, false,
                     filter_node_name);
F
Fam Zheng 已提交
1244 1245
}

1246
void commit_active_start(const char *job_id, BlockDriverState *bs,
1247 1248
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
1249
                         const char *filter_node_name,
1250
                         BlockCompletionFunc *cb, void *opaque, Error **errp,
1251
                         bool auto_complete)
F
Fam Zheng 已提交
1252
{
1253
    int orig_base_flags;
1254
    Error *local_err = NULL;
1255 1256 1257

    orig_base_flags = bdrv_get_flags(base);

F
Fam Zheng 已提交
1258 1259 1260
    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }
1261

1262
    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
1263
                     MIRROR_LEAVE_BACKING_CHAIN,
1264
                     on_error, on_error, true, cb, opaque, &local_err,
1265
                     &commit_active_job_driver, false, base, auto_complete,
1266
                     filter_node_name);
1267
    if (local_err) {
1268
        error_propagate(errp, local_err);
1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
F
Fam Zheng 已提交
1279
}