mirror.c 44.6 KB
Newer Older
P
Paolo Bonzini 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
15
#include "qemu/cutils.h"
P
Paolo Bonzini 已提交
16
#include "trace.h"
17
#include "block/blockjob_int.h"
18
#include "block/block_int.h"
19
#include "sysemu/block-backend.h"
20
#include "qapi/error.h"
21
#include "qapi/qmp/qerror.h"
P
Paolo Bonzini 已提交
22
#include "qemu/ratelimit.h"
23
#include "qemu/bitmap.h"
P
Paolo Bonzini 已提交
24

25 26
/* Interval after which the job sleeps even without a rate limit. */
#define SLICE_TIME    100000000ULL /* ns */
/* Upper bound on the number of mirror operations outstanding at once. */
#define MAX_IN_FLIGHT 16
/* Size cap, in sectors, used when sizing a single copy request. */
#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 MiB */
/* Buffer size, in bytes, that holds MAX_IN_FLIGHT requests of
 * MAX_IO_SECTORS each. */
#define DEFAULT_MIRROR_BUF_SIZE \
    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)
30 31 32 33 34 35 36

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 *
 * This structure is overlaid on the start of each free chunk: the code
 * casts chunk addresses to MirrorBuffer * when queueing them on buf_free,
 * so the link is stored inside the chunk itself.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;
P
Paolo Bonzini 已提交
37 38 39 40

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
K
Kevin Wolf 已提交
41
    BlockBackend *target;
42 43
    BlockDriverState *mirror_top_bs;
    BlockDriverState *source;
F
Fam Zheng 已提交
44
    BlockDriverState *base;
45

46 47 48 49 50 51
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
F
Fam Zheng 已提交
52
    bool is_none_mode;
M
Max Reitz 已提交
53
    BlockMirrorBackingMode backing_mode;
54
    BlockdevOnError on_source_error, on_target_error;
P
Paolo Bonzini 已提交
55 56
    bool synced;
    bool should_complete;
57
    int64_t granularity;
58
    size_t buf_size;
M
Max Reitz 已提交
59
    int64_t bdev_length;
60
    unsigned long *cow_bitmap;
F
Fam Zheng 已提交
61
    BdrvDirtyBitmap *dirty_bitmap;
62
    BdrvDirtyBitmapIter *dbi;
P
Paolo Bonzini 已提交
63
    uint8_t *buf;
64 65
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
66

67
    uint64_t last_pause_ns;
68
    unsigned long *in_flight_bitmap;
69
    int in_flight;
70
    int64_t sectors_in_flight;
71
    int ret;
72
    bool unmap;
K
Kevin Wolf 已提交
73
    bool waiting_for_io;
F
Fam Zheng 已提交
74 75
    int target_cluster_sectors;
    int max_iov;
76
    bool initial_zeroing_ongoing;
P
Paolo Bonzini 已提交
77 78
} MirrorBlockJob;

79 80 81 82 83 84 85
/* State of a single asynchronous mirror operation.  It is passed as the
 * opaque pointer to the AIO completion callbacks and freed by
 * mirror_iteration_done(). */
typedef struct MirrorOp {
    MirrorBlockJob *s;      /* job the operation belongs to */
    QEMUIOVector qiov;      /* buffer chunks used by a copy; empty otherwise */
    int64_t sector_num;     /* first sector covered */
    int nb_sectors;         /* number of sectors covered */
} MirrorOp;

86 87 88 89 90
/*
 * Determine how the job reacts to an I/O error.
 *
 * @s:     the mirror job
 * @read:  true if the error hit the source (read side), false for the target
 * @error: positive errno value
 *
 * Clears s->synced and returns the action that block_job_error_action()
 * selects for the matching error policy.
 */
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    BlockdevOnError policy = read ? s->on_source_error : s->on_target_error;

    s->synced = false;
    return block_job_error_action(&s->common, policy, read, error);
}

99 100 101
/*
 * Finish one mirror operation and release everything it holds.
 *
 * @op:  the completed operation; freed before returning
 * @ret: result of the operation, negative on failure
 *
 * Returns the buffer chunks to the free list, clears the in-flight bits and,
 * on success, records the chunks in the COW bitmap and advances the job
 * offset.  Re-enters the job coroutine if it is waiting for I/O.
 */
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;

    /* Every iovec entry is one chunk of s->buf; hand them all back. */
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        /* The initial zeroing pass does not count as progress. */
        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
141 142

    aio_context_acquire(blk_get_aio_context(s->common.blk));
143 144 145
    if (ret < 0) {
        BlockErrorAction action;

146
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
147
        action = mirror_error_action(s, false, -ret);
W
Wenchao Xia 已提交
148
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
149 150 151 152
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
153
    aio_context_release(blk_get_aio_context(s->common.blk));
154 155 156 157 158 159
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
160 161

    aio_context_acquire(blk_get_aio_context(s->common.blk));
162 163 164
    if (ret < 0) {
        BlockErrorAction action;

165
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
166
        action = mirror_error_action(s, true, -ret);
W
Wenchao Xia 已提交
167
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
168 169 170 171
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
172 173 174
    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
175
    }
176
    aio_context_release(blk_get_aio_context(s->common.blk));
177 178
}

179 180 181 182 183 184 185 186
/* Shrink *nb_sectors so that the range starting at sector_num does not run
 * past the end of the source image (s->bdev_length). */
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    if (!(*nb_sectors < s->bdev_length / BDRV_SECTOR_SIZE - sector_num)) {
        *nb_sectors = s->bdev_length / BDRV_SECTOR_SIZE - sector_num;
    }
}

F
Fam Zheng 已提交
187 188 189 190 191
/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original.
 *
 * COW is needed when the first or the last chunk of the range is not yet
 * marked in s->cow_bitmap.  The result is additionally capped to what
 * s->max_iov chunks can hold and clipped to the end of the source image.
 * *sector_num and *nb_sectors are updated in place; the return value is
 * asserted to be non-negative.
 */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            /* Keep the capped length a whole number of target clusters. */
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

F
Fam Zheng 已提交
227 228 229 230 231 232 233 234
/* Park the job coroutine until it is entered again.
 *
 * waiting_for_io is set for the duration of the yield;
 * mirror_iteration_done() re-enters the job coroutine only while that flag
 * is set.  The assertion rejects a nested wait.
 */
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

F
Fam Zheng 已提交
235
/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 *
 * The request is limited to s->buf_size and to s->max_iov chunks.  The
 * function waits for in-flight operations until enough free buffer chunks
 * are available.  Completion is handled by mirror_read_complete().
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        /* The last chunk may be only partially used. */
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

/*
 * Submit an asynchronous zero write or discard to the target.
 *
 * @s:          the mirror job
 * @sector_num: first sector of the range
 * @nb_sectors: length of the range in sectors
 * @is_discard: true to discard the range, false to write zeroes
 *
 * No data buffer is involved.  Zero writes may unmap when s->unmap is set.
 * Completion is handled by mirror_write_complete().
 */
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
                         op->nb_sectors << BDRV_SECTOR_BITS,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

/*
 * Mirror one run of consecutive dirty chunks.
 *
 * Takes the next dirty chunk from the bitmap iterator (restarting from the
 * beginning when the iterator is exhausted), extends the run over the
 * following dirty chunks, and submits copy, zero or discard operations for
 * it according to the block status of the source.
 *
 * Returns the delay in nanoseconds computed by the rate limiter for the
 * last request submitted, or 0 if no speed limit is set or s->ret already
 * holds an error.
 */
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        /* Iterator exhausted: start over from the beginning of the bitmap. */
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    /* Do not start on a chunk that an earlier operation still covers. */
    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one.  The run ends at the end of the image, at the first clean chunk,
     * at the first chunk that is still in flight, or when it would no longer
     * fit into the buffer. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            /* Status unknown: fall back to copying the range. */
            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
        } else if (ret & BDRV_BLOCK_DATA) {
            io_sectors = MIN(io_sectors, max_io_sectors);
        }

        /* Work in whole chunks, and on at least one of them. */
        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            /* Zero or discard only ranges that are already aligned to the
             * target's clusters; anything else is copied. */
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        if (s->ret < 0) {
            return 0;
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            /* Not charged to the rate limit when the target can write
             * zeroes with unmap. */
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}
466

467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
/* Split s->buf into granularity-sized chunks and queue every chunk on the
 * free list.  The free list must be empty on entry.  The loop ends only
 * when the remaining size reaches exactly zero, so it relies on buf_size
 * being a whole multiple of the granularity. */
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t left;
    uint8_t *cursor = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    for (left = s->buf_size; left != 0; left -= granularity) {
        MirrorBuffer *chunk = (MirrorBuffer *)cursor;

        QSIMPLEQ_INSERT_TAIL(&s->buf_free, chunk, next);
        s->buf_free_count++;
        cursor += granularity;
    }
}

484 485 486 487 488
/* Wait until no mirror operation is in flight any more.
 *
 * This is also used for the .pause callback. There is no matching
 * mirror_resume() because mirror_run() will begin iterating again
 * when the job is resumed.
 */
static void mirror_wait_for_all_io(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

495 496 497 498 499 500 501 502 503
/* Result handed to mirror_exit() through its opaque argument. */
typedef struct {
    int ret;    /* job result; passed on to block_job_completed() */
} MirrorExitData;

/*
 * Final stage of the job: rearrange the block graph and complete the job.
 *
 * @job:    the block job, embedded in a MirrorBlockJob
 * @opaque: a MirrorExitData carrying the job result; freed here
 *
 * Detaches the job from the target, fixes up the target's backing node if
 * requested, replaces the source (or the node named by "replaces") with the
 * target when the job completed successfully, removes the mirror filter
 * node and reports the result through block_job_completed().
 */
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
    Error *local_err = NULL;

    /* Make sure that the source BDS doesn't go away before we call
     * block_job_completed(). */
    bdrv_ref(src);
    bdrv_ref(mirror_top_bs);
    bdrv_ref(target_bs);

    /* Remove target parent that still uses BLK_PERM_WRITE/RESIZE before
     * inserting target_bs at s->to_replace, where we might not be able to get
     * these permissions. */
    blk_unref(s->target);
    s->target = NULL;

    /* We don't access the source any more. Dropping any WRITE/RESIZE is
     * required before it could become a backing file of target_bs. */
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target_bs) != backing) {
            bdrv_set_backing_hd(target_bs, backing, &local_err);
            if (local_err) {
                error_report_err(local_err);
                /* error_report_err() freed the error object; do not keep a
                 * dangling pointer around. */
                local_err = NULL;
                data->ret = -EPERM;
            }
        }
    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        /* Give the target the same open flags as the node it replaces. */
        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_node(to_replace, target_bs, &local_err);
        bdrv_drained_end(target_bs);
        if (local_err) {
            error_report_err(local_err);
            data->ret = -EPERM;
        }
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_unref(target_bs);

    /* Remove the mirror filter driver from the graph. Before this, get rid of
     * the blockers on the intermediate nodes so that the resulting state is
     * valid. Also give up permissions on mirror_top_bs->backing, which might
     * block the removal. */
    block_job_remove_all_bdrv(job);
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);

    /* We just changed the BDS the job BB refers to (with either or both of the
     * bdrv_replace_node() calls), so switch the BB back so the cleanup does
     * the right thing. We don't need any permissions any more now. */
    blk_remove_bs(job->blk);
    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);

    block_job_completed(&s->common, data->ret);

    g_free(data);
    /* NOTE(review): the matching bdrv_drained_begin() on the source is not in
     * this function - presumably issued by mirror_run() before it schedules
     * the exit; confirm. */
    bdrv_drained_end(src);
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
}

596 597 598 599 600 601 602 603 604 605 606 607
/* Give other users of the main loop a chance to run.  Once more than
 * SLICE_TIME has passed since the last sleep, record the current time and
 * sleep for 0 ns; otherwise only offer a pause point. */
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns <= SLICE_TIME) {
        block_job_pause_point(&s->common);
        return;
    }

    s->last_pause_ns = now;
    block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
}

608 609 610 611
/*
 * Prepare the target and the dirty bitmap before the main copy loop.
 *
 * If there is no base and the target does not read back as zeroes, the
 * target is either zeroed up front (when it can write zeroes with unmap) or
 * the whole image is marked dirty.  Afterwards the source is scanned and
 * every range allocated above the base is marked dirty.
 *
 * Returns 0 on success or when the job was cancelled, or the negative error
 * returned by bdrv_is_allocated_above().
 */
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            /* Zeroing is not done up front: copy everything instead. */
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            /* Largest granularity-aligned request whose byte count still
             * fits into an int. */
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
                                   s->in_flight);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_wait_for_all_io(s);
        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        /* n sectors share the reported allocation state. */
        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

677 678 679 680 681 682 683 684 685 686 687 688 689 690
/* Flush the target.  Called when going out of the streaming phase to flush
 * the bulk of the data to the medium, or just before completing.
 *
 * Returns the result of blk_flush().  On failure the target error policy is
 * applied and, if the error is to be reported, it is stored in s->ret.
 */
static int mirror_flush(MirrorBlockJob *s)
{
    int ret = blk_flush(s->target);

    if (ret >= 0) {
        return ret;
    }

    if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) {
        s->ret = ret;
    }
    return ret;
}

P
Paolo Bonzini 已提交
691 692 693
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
694
    MirrorExitData *data;
695
    BlockDriverState *bs = s->source;
K
Kevin Wolf 已提交
696
    BlockDriverState *target_bs = blk_bs(s->target);
697
    bool need_drain = true;
698
    int64_t length;
699
    BlockDriverInfo bdi;
700 701
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
P
Paolo Bonzini 已提交
702
    int ret = 0;
F
Fam Zheng 已提交
703
    int target_cluster_size = BDRV_SECTOR_SIZE;
P
Paolo Bonzini 已提交
704 705 706 707 708

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

M
Max Reitz 已提交
709 710 711
    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
712
        goto immediate_exit;
713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
    }

    /* Active commit must resize the base image if its size differs from the
     * active layer. */
    if (s->base == blk_bs(s->target)) {
        int64_t base_length;

        base_length = blk_getlength(s->target);
        if (base_length < 0) {
            ret = base_length;
            goto immediate_exit;
        }

        if (s->bdev_length > base_length) {
            ret = blk_truncate(s->target, s->bdev_length);
            if (ret < 0) {
                goto immediate_exit;
            }
        }
    }

    if (s->bdev_length == 0) {
735 736 737 738 739 740 741 742
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
P
Paolo Bonzini 已提交
743 744
    }

M
Max Reitz 已提交
745
    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
746 747
    s->in_flight_bitmap = bitmap_new(length);

748 749 750 751
    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
K
Kevin Wolf 已提交
752
    bdrv_get_backing_filename(target_bs, backing_filename,
753
                              sizeof(backing_filename));
K
Kevin Wolf 已提交
754
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
F
Fam Zheng 已提交
755 756
        target_cluster_size = bdi.cluster_size;
    }
K
Kevin Wolf 已提交
757
    if (backing_filename[0] && !target_bs->backing
F
Fam Zheng 已提交
758 759 760
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
761
    }
F
Fam Zheng 已提交
762
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
K
Kevin Wolf 已提交
763
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
764

765 766 767 768 769 770
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

771
    mirror_free_init(s);
P
Paolo Bonzini 已提交
772

773
    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
F
Fam Zheng 已提交
774
    if (!s->is_none_mode) {
775 776 777
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
P
Paolo Bonzini 已提交
778 779 780
        }
    }

781 782
    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
P
Paolo Bonzini 已提交
783
    for (;;) {
784
        uint64_t delay_ns = 0;
785
        int64_t cnt, delta;
P
Paolo Bonzini 已提交
786 787
        bool should_complete;

788 789 790 791 792
        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

793 794
        block_job_pause_point(&s->common);

795
        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
M
Max Reitz 已提交
796 797 798 799 800 801
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
802 803

        /* Note that even when no rate limit is applied we need to yield
804
         * periodically with no pending I/O so that bdrv_drain_all() returns.
805 806 807
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
808 809
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
810
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
811
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
812
                (cnt == 0 && s->in_flight > 0)) {
813
                trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
F
Fam Zheng 已提交
814
                mirror_wait_for_io(s);
815 816
                continue;
            } else if (cnt != 0) {
817
                delay_ns = mirror_iteration(s);
P
Paolo Bonzini 已提交
818 819 820 821
            }
        }

        should_complete = false;
822
        if (s->in_flight == 0 && cnt == 0) {
P
Paolo Bonzini 已提交
823
            trace_mirror_before_flush(s);
824 825 826 827
            if (!s->synced) {
                if (mirror_flush(s) < 0) {
                    /* Go check s->ret.  */
                    continue;
828 829 830 831 832 833
                }
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
834 835
                block_job_event_ready(&s->common);
                s->synced = true;
P
Paolo Bonzini 已提交
836
            }
837 838 839 840

            should_complete = s->should_complete ||
                block_job_is_cancelled(&s->common);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
P
Paolo Bonzini 已提交
841 842 843 844 845 846 847 848 849
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
850 851 852
             * mirror_populate runs, so pause it now.  Before deciding
             * whether to switch to target check one last time if I/O has
             * come in the meanwhile, and if not flush the data to disk.
P
Paolo Bonzini 已提交
853 854
             */
            trace_mirror_before_drain(s, cnt);
855 856

            bdrv_drained_begin(bs);
857
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
858
            if (cnt > 0 || mirror_flush(s) < 0) {
859 860 861 862 863 864 865 866 867 868 869
                bdrv_drained_end(bs);
                continue;
            }

            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            need_drain = false;
            break;
P
Paolo Bonzini 已提交
870 871 872
        }

        ret = 0;
873
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
P
Paolo Bonzini 已提交
874
        if (!s->synced) {
875
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
P
Paolo Bonzini 已提交
876 877 878 879
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
880
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
881
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
P
Paolo Bonzini 已提交
882
        }
883
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
P
Paolo Bonzini 已提交
884 885 886
    }

immediate_exit:
887 888 889 890 891 892
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
893
        assert(need_drain);
894
        mirror_wait_for_all_io(s);
895 896 897
    }

    assert(s->in_flight == 0);
898
    qemu_vfree(s->buf);
899
    g_free(s->cow_bitmap);
900
    g_free(s->in_flight_bitmap);
901
    bdrv_dirty_iter_free(s->dbi);
F
Fam Zheng 已提交
902
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
903 904 905

    data = g_malloc(sizeof(*data));
    data->ret = ret;
906 907 908 909

    if (need_drain) {
        bdrv_drained_begin(bs);
    }
910
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
P
Paolo Bonzini 已提交
911 912 913 914 915 916 917
}

/*
 * .set_speed callback of the mirror job drivers.
 *
 * A negative @speed is rejected through @errp.  Otherwise @speed is divided
 * by BDRV_SECTOR_SIZE and handed to the job's rate limiter together with
 * SLICE_TIME.
 */
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *mirror;

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }

    mirror = container_of(job, MirrorBlockJob, common);
    ratelimit_set_speed(&mirror->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

P
Paolo Bonzini 已提交
924 925 926
/*
 * .complete callback of the mirror job drivers.
 *
 * Reports an error through @errp and leaves the job untouched when:
 *  - the job has not reached the synced state yet,
 *  - backing_mode is MIRROR_OPEN_BACKING_CHAIN and opening the target's
 *    backing file fails, or
 *  - a node name to replace was given but no such node exists.
 *
 * Otherwise the job is flagged with should_complete and entered again.
 */
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockDriverState *target_bs = blk_bs(mirror->target);

    if (!mirror->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (mirror->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        /* The target is expected to have no backing child attached yet. */
        assert(!target_bs->backing);
        if (bdrv_open_backing_file(target_bs, NULL, "backing", errp) < 0) {
            return;
        }
    }

    /* Block every operation on the node that is going to be replaced. */
    if (mirror->replaces) {
        AioContext *ctx;

        mirror->to_replace = bdrv_find_node(mirror->replaces);
        if (!mirror->to_replace) {
            error_setg(errp, "Node name '%s' not found", mirror->replaces);
            return;
        }

        ctx = bdrv_get_aio_context(mirror->to_replace);
        aio_context_acquire(ctx);

        /* TODO Translate this into permission system. Current definition of
         * GRAPH_MOD would require to request it for the parents; they might
         * not even be BlockDriverStates, however, so a BdrvChild can't address
         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&mirror->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(mirror->to_replace, mirror->replace_blocker);
        bdrv_ref(mirror->to_replace);

        aio_context_release(ctx);
    }

    mirror->should_complete = true;
    block_job_enter(&mirror->common);
}

976
/*
 * .pause callback of the mirror job drivers: hands the job over to
 * mirror_wait_for_all_io() before the pause takes effect.
 */
static void mirror_pause(BlockJob *job)
{
    mirror_wait_for_all_io(container_of(job, MirrorBlockJob, common));
}

/*
 * .attached_aio_context callback of the mirror job drivers: moves the
 * job's target BlockBackend to @new_context.
 */
static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target_blk = mirror->target;

    blk_set_aio_context(target_blk, new_context);
}

990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
/*
 * .drain callback of the mirror job drivers: drains the target
 * BlockBackend, if the job still has one.
 */
static void mirror_drain(BlockJob *job)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target_blk = mirror->target;

    if (!target_blk) {
        return;
    }

    /* Hold our own reference across the drain in case blk_drain triggers
     * execution of mirror_complete...
     */
    blk_ref(target_blk);
    blk_drain(target_blk);
    blk_unref(target_blk);
}

1005
static const BlockJobDriver mirror_job_driver = {
1006 1007 1008
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1009
    .start                  = mirror_run,
1010 1011 1012
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1013
    .drain                  = mirror_drain,
P
Paolo Bonzini 已提交
1014 1015
};

F
Fam Zheng 已提交
1016
static const BlockJobDriver commit_active_job_driver = {
1017 1018 1019
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1020
    .start                  = mirror_run,
1021 1022 1023
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1024
    .drain                  = mirror_drain,
F
Fam Zheng 已提交
1025 1026
};

1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
/* Read path of the mirror_top node: forwarded unchanged to the backing
 * child. */
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_preadv(backing_child, offset, bytes, qiov, flags);
}

/* Write path of the mirror_top node: forwarded unchanged to the backing
 * child. */
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_pwritev(backing_child, offset, bytes, qiov, flags);
}

/* Flush of the mirror_top node: flushes the node behind the backing child. */
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
    BlockDriverState *backing_node = bs->backing->bs;

    return bdrv_co_flush(backing_node);
}

/*
 * Block status of the mirror_top node.
 *
 * The whole requested range is reported in one go (*pnum = nb_sectors) and
 * the caller is redirected to the backing node through *file.  The return
 * value combines BDRV_BLOCK_RAW, BDRV_BLOCK_OFFSET_VALID and BDRV_BLOCK_DATA
 * with the byte offset of @sector_num.
 */
static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    int64_t status = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
                     BDRV_BLOCK_DATA;

    *pnum = nb_sectors;
    *file = bs->backing->bs;

    status |= sector_num << BDRV_SECTOR_BITS;
    return status;
}

/* Write-zeroes path of the mirror_top node: forwarded unchanged to the
 * backing child. */
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
{
    BdrvChild *backing_child = bs->backing;

    return bdrv_co_pwrite_zeroes(backing_child, offset, count, flags);
}

/* Discard path of the mirror_top node: applied to the node behind the
 * backing child. */
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
    int64_t offset, int count)
{
    BlockDriverState *backing_node = bs->backing->bs;

    return bdrv_co_pdiscard(backing_node, offset, count);
}

1066 1067 1068 1069 1070 1071 1072
/*
 * Filename refresh for the mirror_top node: refreshes the backing node
 * first, then copies that node's filename into bs->exact_filename.
 * @opts is not used.
 */
static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
{
    BlockDriverState *backing_node = bs->backing->bs;

    bdrv_refresh_filename(backing_node);
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            backing_node->filename);
}

1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100
/* Close callback of the mirror_top node; intentionally does nothing. */
static void bdrv_mirror_top_close(BlockDriverState *bs)
{
    /* No work is done here. */
}

/*
 * Permission callback of the mirror_top node.
 *
 * The only permission requested on the child is BLK_PERM_WRITE, and only
 * when the node's own users hold it (so that guest writes can be forwarded
 * to the real image).  Everything is shared: *nshared is BLK_PERM_ALL.
 * @c, @role and @shared are not consulted.
 */
static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    *nperm = (perm & BLK_PERM_WRITE) ? BLK_PERM_WRITE : 0;
    *nshared = BLK_PERM_ALL;
}

/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_mirror_top = {
    .format_name                = "mirror_top",
    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,
    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,
1101
    .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
1102 1103 1104 1105
    .bdrv_close                 = bdrv_mirror_top_close,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
};

1106
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
1107 1108 1109
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
                             uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1110
                             BlockMirrorBackingMode backing_mode,
1111 1112
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
1113
                             bool unmap,
1114
                             BlockCompletionFunc *cb,
1115
                             void *opaque,
1116
                             const BlockJobDriver *driver,
1117
                             bool is_none_mode, BlockDriverState *base,
1118 1119
                             bool auto_complete, const char *filter_node_name,
                             Error **errp)
P
Paolo Bonzini 已提交
1120 1121
{
    MirrorBlockJob *s;
1122 1123 1124
    BlockDriverState *mirror_top_bs;
    bool target_graph_mod;
    bool target_is_backing;
1125
    Error *local_err = NULL;
1126
    int ret;
P
Paolo Bonzini 已提交
1127

1128
    if (granularity == 0) {
1129
        granularity = bdrv_get_default_bitmap_granularity(target);
1130 1131 1132 1133
    }

    assert ((granularity & (granularity - 1)) == 0);

W
Wen Congyang 已提交
1134 1135 1136 1137 1138 1139 1140 1141
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }
F
Fam Zheng 已提交
1142

1143 1144 1145
    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
     * the backing chain writable. */
1146 1147
    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
                                         BDRV_O_RDWR, errp);
1148 1149 1150 1151
    if (mirror_top_bs == NULL) {
        return;
    }
    mirror_top_bs->total_sectors = bs->total_sectors;
1152
    bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
1153 1154

    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
M
Max Reitz 已提交
1155
     * it alive until block_job_create() succeeds even if bs has no parent. */
1156 1157
    bdrv_ref(mirror_top_bs);
    bdrv_drained_begin(bs);
1158
    bdrv_append(mirror_top_bs, bs, &local_err);
1159 1160
    bdrv_drained_end(bs);

1161 1162 1163 1164 1165 1166
    if (local_err) {
        bdrv_unref(mirror_top_bs);
        error_propagate(errp, local_err);
        return;
    }

1167 1168 1169 1170 1171
    /* Make sure that the source is not resized while the job is running */
    s = block_job_create(job_id, driver, mirror_top_bs,
                         BLK_PERM_CONSISTENT_READ,
                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
1172
                         creation_flags, cb, opaque, errp);
P
Paolo Bonzini 已提交
1173
    if (!s) {
1174
        goto fail;
P
Paolo Bonzini 已提交
1175
    }
M
Max Reitz 已提交
1176 1177 1178
    /* The block job now has a reference to this node */
    bdrv_unref(mirror_top_bs);

1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197
    s->source = bs;
    s->mirror_top_bs = mirror_top_bs;

    /* No resize for the target either; while the mirror is still running, a
     * consistent read isn't necessarily possible. We could possibly allow
     * writes and graph modifications, though it would likely defeat the
     * purpose of a mirror, so leave them blocked for now.
     *
     * In the case of active commit, things look a bit different, though,
     * because the target is an already populated backing file in active use.
     * We can allow anything except resize there.*/
    target_is_backing = bdrv_chain_contains(bs, target);
    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                        BLK_PERM_WRITE_UNCHANGED |
                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
                                             BLK_PERM_WRITE |
                                             BLK_PERM_GRAPH_MOD : 0));
1198 1199
    ret = blk_insert_bs(s->target, target, errp);
    if (ret < 0) {
1200
        goto fail;
1201
    }
K
Kevin Wolf 已提交
1202

1203
    s->replaces = g_strdup(replaces);
1204 1205
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
F
Fam Zheng 已提交
1206
    s->is_none_mode = is_none_mode;
M
Max Reitz 已提交
1207
    s->backing_mode = backing_mode;
F
Fam Zheng 已提交
1208
    s->base = base;
1209
    s->granularity = granularity;
W
Wen Congyang 已提交
1210
    s->buf_size = ROUND_UP(buf_size, granularity);
1211
    s->unmap = unmap;
1212 1213 1214
    if (auto_complete) {
        s->should_complete = true;
    }
1215

1216
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
1217
    if (!s->dirty_bitmap) {
1218
        goto fail;
1219
    }
1220

1221
    /* Required permissions are already taken with blk_new() */
1222 1223 1224
    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);

1225 1226
    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
1227
    if (target_is_backing) {
1228 1229
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
1230 1231 1232 1233 1234 1235 1236 1237 1238 1239
            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
             * ourselves at s->base (if writes are blocked for a node, they are
             * also blocked for its backing file). The other options would be a
             * second filter driver above s->base (== target). */
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                     errp);
            if (ret < 0) {
                goto fail;
            }
1240 1241
        }
    }
1242

J
John Snow 已提交
1243 1244
    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
1245 1246 1247 1248
    return;

fail:
    if (s) {
M
Max Reitz 已提交
1249 1250 1251 1252
        /* Make sure this BDS does not go away until we have completed the graph
         * changes below */
        bdrv_ref(mirror_top_bs);

1253 1254 1255 1256 1257
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
    }

1258 1259
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
1260
    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
M
Max Reitz 已提交
1261 1262

    bdrv_unref(mirror_top_bs);
P
Paolo Bonzini 已提交
1263
}
F
Fam Zheng 已提交
1264

1265 1266
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
1267
                  int64_t speed, uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1268 1269
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
F
Fam Zheng 已提交
1270
                  BlockdevOnError on_target_error,
1271
                  bool unmap, const char *filter_node_name, Error **errp)
F
Fam Zheng 已提交
1272 1273 1274 1275
{
    bool is_none_mode;
    BlockDriverState *base;

1276 1277
    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
1278 1279
        return;
    }
F
Fam Zheng 已提交
1280
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
1281
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
1282
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
M
Max Reitz 已提交
1283
                     speed, granularity, buf_size, backing_mode,
1284
                     on_source_error, on_target_error, unmap, NULL, NULL,
1285
                     &mirror_job_driver, is_none_mode, base, false,
1286
                     filter_node_name, errp);
F
Fam Zheng 已提交
1287 1288
}

1289
void commit_active_start(const char *job_id, BlockDriverState *bs,
1290 1291
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
1292
                         const char *filter_node_name,
1293
                         BlockCompletionFunc *cb, void *opaque, Error **errp,
1294
                         bool auto_complete)
F
Fam Zheng 已提交
1295
{
1296
    int orig_base_flags;
1297
    Error *local_err = NULL;
1298 1299 1300

    orig_base_flags = bdrv_get_flags(base);

F
Fam Zheng 已提交
1301 1302 1303
    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }
1304

1305
    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
1306
                     MIRROR_LEAVE_BACKING_CHAIN,
1307
                     on_error, on_error, true, cb, opaque,
1308
                     &commit_active_job_driver, false, base, auto_complete,
1309
                     filter_node_name, &local_err);
1310
    if (local_err) {
1311
        error_propagate(errp, local_err);
1312 1313 1314 1315 1316 1317 1318 1319 1320 1321
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
F
Fam Zheng 已提交
1322
}