mirror.c 43.7 KB
Newer Older
P
Paolo Bonzini 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
P
Paolo Bonzini 已提交
15
#include "trace.h"
16
#include "block/blockjob_int.h"
17
#include "block/block_int.h"
18
#include "sysemu/block-backend.h"
19
#include "qapi/error.h"
20
#include "qapi/qmp/qerror.h"
P
Paolo Bonzini 已提交
21
#include "qemu/ratelimit.h"
22
#include "qemu/bitmap.h"
P
Paolo Bonzini 已提交
23

24 25
/* Interval, in nanoseconds, after which the job forces a yield
 * (see mirror_throttle()). */
#define SLICE_TIME    100000000ULL /* ns */
/* Number of mirror operations allowed in flight before the job waits. */
#define MAX_IN_FLIGHT 16
/* 1 MiB expressed in sectors; used when computing the per-request size cap
 * in mirror_iteration(). */
#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 MiB */
/* MAX_IN_FLIGHT requests of MAX_IO_SECTORS sectors each, in bytes. */
#define DEFAULT_MIRROR_BUF_SIZE \
    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)
29 30 31 32 33 34 35

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 *
 * The list node is stored inside the free chunk itself: mirror_free_init()
 * and mirror_iteration_done() cast a chunk's address to MirrorBuffer *
 * before queueing it on MirrorBlockJob.buf_free.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;
P
Paolo Bonzini 已提交
36 37 38 39

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
K
Kevin Wolf 已提交
40
    BlockBackend *target;
41 42
    BlockDriverState *mirror_top_bs;
    BlockDriverState *source;
F
Fam Zheng 已提交
43
    BlockDriverState *base;
44

45 46 47 48 49 50
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
F
Fam Zheng 已提交
51
    bool is_none_mode;
M
Max Reitz 已提交
52
    BlockMirrorBackingMode backing_mode;
53
    BlockdevOnError on_source_error, on_target_error;
P
Paolo Bonzini 已提交
54 55
    bool synced;
    bool should_complete;
56
    int64_t granularity;
57
    size_t buf_size;
M
Max Reitz 已提交
58
    int64_t bdev_length;
59
    unsigned long *cow_bitmap;
F
Fam Zheng 已提交
60
    BdrvDirtyBitmap *dirty_bitmap;
61
    BdrvDirtyBitmapIter *dbi;
P
Paolo Bonzini 已提交
62
    uint8_t *buf;
63 64
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
65

66
    uint64_t last_pause_ns;
67
    unsigned long *in_flight_bitmap;
68
    int in_flight;
69
    int64_t sectors_in_flight;
70
    int ret;
71
    bool unmap;
K
Kevin Wolf 已提交
72
    bool waiting_for_io;
F
Fam Zheng 已提交
73 74
    int target_cluster_sectors;
    int max_iov;
75
    bool initial_zeroing_ongoing;
P
Paolo Bonzini 已提交
76 77
} MirrorBlockJob;

78 79 80 81 82 83 84
/* One outstanding mirror request.  It is handed to the AIO completion
 * callbacks as their opaque pointer and freed by mirror_iteration_done(). */
typedef struct MirrorOp {
    MirrorBlockJob *s;      /* owning job */
    QEMUIOVector qiov;      /* chunks taken from s->buf_free; left zeroed for
                             * zero/discard requests */
    int64_t sector_num;     /* first sector of the request */
    int nb_sectors;         /* length of the request in sectors */
} MirrorOp;

85 86 87 88 89
/* Decide how to react to an I/O error.
 *
 * @read selects the policy: on_source_error for failed reads,
 * on_target_error otherwise.  @error is forwarded unchanged to
 * block_job_error_action(); the callers in this file pass a positive errno
 * value.  The job is marked as no longer synced in either case.
 *
 * Returns the action chosen by block_job_error_action().
 */
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

98 99 100
/* Finish one mirror request and release its resources.
 *
 * Returns the request's buffer chunks to the free list, clears its chunks in
 * the in-flight bitmap, updates the progress accounting when @ret >= 0, and
 * frees @op.  If the job coroutine is waiting in mirror_wait_for_io() it is
 * re-entered at the end.
 */
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    /* Each iovec element starts at a chunk of s->buf; put it back. */
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        /* Requests issued during the initial zeroing pass do not count as
         * progress. */
        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
140 141

    aio_context_acquire(blk_get_aio_context(s->common.blk));
142 143 144
    if (ret < 0) {
        BlockErrorAction action;

145
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
146
        action = mirror_error_action(s, false, -ret);
W
Wenchao Xia 已提交
147
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
148 149 150 151
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
152
    aio_context_release(blk_get_aio_context(s->common.blk));
153 154 155 156 157 158
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
159 160

    aio_context_acquire(blk_get_aio_context(s->common.blk));
161 162 163
    if (ret < 0) {
        BlockErrorAction action;

164
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
165
        action = mirror_error_action(s, true, -ret);
W
Wenchao Xia 已提交
166
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
167 168 169 170
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
171 172 173
    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
174
    }
175
    aio_context_release(blk_get_aio_context(s->common.blk));
176 177
}

178 179 180 181 182 183 184 185
/* Clamp *nb_sectors so that a range starting at sector_num does not extend
 * past s->bdev_length (the source length in bytes).  sector_num itself is
 * not modified.
 * NOTE(review): the comparison is done in whatever type MIN() and
 * BDRV_SECTOR_SIZE promote to; both are defined outside this file. */
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

F
Fam Zheng 已提交
186 187 188 189 190
/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original.
 *
 * COW is considered necessary when the first or the last chunk of the range
 * is not yet marked in s->cow_bitmap.  The resulting length is additionally
 * capped at (chunk size * s->max_iov) sectors and clipped to the end of the
 * source.  *sector_num and *nb_sectors are updated in place; the return
 * value is asserted to be non-negative.
 */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            /* Keep the shortened request a whole number of clusters. */
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

F
Fam Zheng 已提交
226 227 228 229 230 231 232 233
/* Yield the job coroutine until mirror_iteration_done() re-enters it.
 * waiting_for_io is what tells mirror_iteration_done() to do so; the
 * assertion rejects a second wait while one is already pending. */
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

F
Fam Zheng 已提交
234
/* Submit async read while handling COW.
 *
 * The request is first limited to s->buf_size and to (chunk size *
 * s->max_iov) sectors, then aligned by mirror_cow_align() when a COW bitmap
 * exists.  The function waits until enough free buffer chunks are available
 * before building the request.
 *
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        /* The last chunk may be only partially used. */
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

/* Submit an asynchronous discard (@is_discard true) or zero write
 * (@is_discard false) of @nb_sectors sectors at @sector_num on the target.
 * Zero writes carry BDRV_REQ_MAY_UNMAP when s->unmap is set.  Completion is
 * handled by mirror_write_complete().
 */
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
                         op->nb_sectors << BDRV_SECTOR_BITS,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

/* Mirror one run of consecutive dirty chunks.
 *
 * Picks the next dirty chunk from the dirty bitmap iterator (restarting the
 * iterator from sector 0 when it is exhausted), extends the run over the
 * following dirty chunks, clears their dirty bits, marks them in flight and
 * then issues copy, zero or discard requests until the run is consumed.
 *
 * Returns the delay in nanoseconds computed by the rate limiter for the last
 * request issued, or 0 if no speed limit is set or an error has been
 * recorded in s->ret.
 */
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    /* Do not start on a chunk that still has a request outstanding. */
    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one.  The run ends at the first chunk that is past the end of the
     * source, not dirty, or still in flight, and never exceeds buf_size. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            /* Status unknown: fall back to copying the data. */
            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
        } else if (ret & BDRV_BLOCK_DATA) {
            io_sectors = MIN(io_sectors, max_io_sectors);
        }

        /* Work on whole chunks, and on at least one. */
        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            /* Zero/discard only when the range is already aligned to the
             * target's clusters; otherwise keep copying. */
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        if (s->ret < 0) {
            return 0;
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            /* Not charged against the rate limit when the target can write
             * zeroes with unmap. */
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}
465

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
/* Build the free list: walk s->buf in steps of s->granularity bytes and
 * queue every chunk on s->buf_free, counting them in s->buf_free_count.
 * The free count must be zero on entry. */
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t remaining;
    uint8_t *chunk = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    for (remaining = s->buf_size; remaining != 0; remaining -= granularity) {
        MirrorBuffer *node = (MirrorBuffer *)chunk;

        QSIMPLEQ_INSERT_TAIL(&s->buf_free, node, next);
        s->buf_free_count++;
        chunk += granularity;
    }
}

483 484 485 486 487
/* Wait until no mirror request is in flight any more.
 *
 * This is also used for the .pause callback. There is no matching
 * mirror_resume() because mirror_run() will begin iterating again
 * when the job is resumed.
 */
static void mirror_wait_for_all_io(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

494 495 496 497 498 499 500 501 502
/* Argument handed to mirror_exit() through its opaque pointer; mirror_exit()
 * frees it. */
typedef struct {
    int ret;    /* job result passed on to block_job_completed() */
} MirrorExitData;

/* Tear down a mirror job and, if it completed successfully, switch the
 * graph over to the target.
 *
 * @opaque is a MirrorExitData carrying the job result; it is freed here.
 * The steps are: drop the job's target BlockBackend, fix up the target's
 * backing file when backing_mode is MIRROR_SOURCE_BACKING_CHAIN, replace
 * the source (or s->to_replace) with the target if completion was requested
 * and no error occurred, remove the mirror filter node from the graph, and
 * report completion with block_job_completed().
 */
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
    Error *local_err = NULL;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);
    bdrv_ref(mirror_top_bs);
    bdrv_ref(target_bs);

    /* Remove target parent that still uses BLK_PERM_WRITE/RESIZE before
     * inserting target_bs at s->to_replace, where we might not be able to get
     * these permissions. */
    blk_unref(s->target);
    s->target = NULL;

    /* We don't access the source any more. Dropping any WRITE/RESIZE is
     * required before it could become a backing file of target_bs. */
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target_bs) != backing) {
            bdrv_set_backing_hd(target_bs, backing, &local_err);
            if (local_err) {
                error_report_err(local_err);
                data->ret = -EPERM;
            }
        }
    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        /* Give the target the same flags as the node it replaces. */
        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_unref(target_bs);

    /* Remove the mirror filter driver from the graph. Before this, get rid of
     * the blockers on the intermediate nodes so that the resulting state is
     * valid. Also give up permissions on mirror_top_bs->backing, which might
     * block the removal. */
    block_job_remove_all_bdrv(job);
    bdrv_child_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL);
    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));

    /* We just changed the BDS the job BB refers to (with either or both of the
     * bdrv_replace_in_backing_chain() calls), so switch the BB back so the
     * cleanup does the right thing. We don't need any permissions any more
     * now. */
    blk_remove_bs(job->blk);
    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);

    block_job_completed(&s->common, data->ret);

    g_free(data);
    /* NOTE(review): presumably pairs with the bdrv_drained_begin() that
     * mirror_run() issues on the source before exiting - confirm. */
    bdrv_drained_end(src);
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
}

591 592 593 594 595 596 597 598 599 600 601 602
/* Called from long-running loops.  Once more than SLICE_TIME nanoseconds
 * have passed since s->last_pause_ns, record the current time and call
 * block_job_sleep_ns() with a zero delay; before that, only pass through
 * block_job_pause_point(). */
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns <= SLICE_TIME) {
        block_job_pause_point(&s->common);
        return;
    }

    s->last_pause_ns = now;
    block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
}

603 604 605 606
/* Prepare the dirty bitmap before the main mirroring loop starts.
 *
 * If there is no base and the target is not known to read as zeroes, the
 * target is zeroed first; when it cannot write zeroes with unmap, the whole
 * source is marked dirty instead and the function returns.  Afterwards every
 * range that bdrv_is_allocated_above() reports as allocated above the base
 * is marked dirty.
 *
 * Returns 0 on success or when the job is cancelled, or the negative value
 * returned by bdrv_is_allocated_above() on failure.
 */
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            /* Largest granularity-aligned request whose byte count still
             * fits in an int. */
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_wait_for_all_io(s);
        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        /* n is the length of the range the answer applies to. */
        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

671 672 673 674 675 676 677 678 679 680 681 682 683 684
/* Flush the target.  Used when leaving the streaming phase, so that the bulk
 * of the data reaches the medium, and again just before completing.
 *
 * Returns the result of blk_flush().  On failure the target error policy is
 * consulted and, if it asks for the error to be reported, the error is
 * stored in s->ret.
 */
static int mirror_flush(MirrorBlockJob *s)
{
    int ret = blk_flush(s->target);

    if (ret >= 0) {
        return ret;
    }

    if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) {
        s->ret = ret;
    }
    return ret;
}

P
Paolo Bonzini 已提交
685 686 687
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
688
    MirrorExitData *data;
689
    BlockDriverState *bs = s->source;
K
Kevin Wolf 已提交
690
    BlockDriverState *target_bs = blk_bs(s->target);
691
    bool need_drain = true;
692
    int64_t length;
693
    BlockDriverInfo bdi;
694 695
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
P
Paolo Bonzini 已提交
696
    int ret = 0;
F
Fam Zheng 已提交
697
    int target_cluster_size = BDRV_SECTOR_SIZE;
P
Paolo Bonzini 已提交
698 699 700 701 702

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

M
Max Reitz 已提交
703 704 705
    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
706
        goto immediate_exit;
707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
    }

    /* Active commit must resize the base image if its size differs from the
     * active layer. */
    if (s->base == blk_bs(s->target)) {
        int64_t base_length;

        base_length = blk_getlength(s->target);
        if (base_length < 0) {
            ret = base_length;
            goto immediate_exit;
        }

        if (s->bdev_length > base_length) {
            ret = blk_truncate(s->target, s->bdev_length);
            if (ret < 0) {
                goto immediate_exit;
            }
        }
    }

    if (s->bdev_length == 0) {
729 730 731 732 733 734 735 736
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
P
Paolo Bonzini 已提交
737 738
    }

M
Max Reitz 已提交
739
    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
740 741
    s->in_flight_bitmap = bitmap_new(length);

742 743 744 745
    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
K
Kevin Wolf 已提交
746
    bdrv_get_backing_filename(target_bs, backing_filename,
747
                              sizeof(backing_filename));
K
Kevin Wolf 已提交
748
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
F
Fam Zheng 已提交
749 750
        target_cluster_size = bdi.cluster_size;
    }
K
Kevin Wolf 已提交
751
    if (backing_filename[0] && !target_bs->backing
F
Fam Zheng 已提交
752 753 754
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
755
    }
F
Fam Zheng 已提交
756
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
K
Kevin Wolf 已提交
757
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
758

759 760 761 762 763 764
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

765
    mirror_free_init(s);
P
Paolo Bonzini 已提交
766

767
    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
F
Fam Zheng 已提交
768
    if (!s->is_none_mode) {
769 770 771
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
P
Paolo Bonzini 已提交
772 773 774
        }
    }

775 776
    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
P
Paolo Bonzini 已提交
777
    for (;;) {
778
        uint64_t delay_ns = 0;
779
        int64_t cnt, delta;
P
Paolo Bonzini 已提交
780 781
        bool should_complete;

782 783 784 785 786
        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

787 788
        block_job_pause_point(&s->common);

789
        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
M
Max Reitz 已提交
790 791 792 793 794 795
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
796 797

        /* Note that even when no rate limit is applied we need to yield
798
         * periodically with no pending I/O so that bdrv_drain_all() returns.
799 800 801
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
802 803
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
804
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
805
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
806 807
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
F
Fam Zheng 已提交
808
                mirror_wait_for_io(s);
809 810
                continue;
            } else if (cnt != 0) {
811
                delay_ns = mirror_iteration(s);
P
Paolo Bonzini 已提交
812 813 814 815
            }
        }

        should_complete = false;
816
        if (s->in_flight == 0 && cnt == 0) {
P
Paolo Bonzini 已提交
817
            trace_mirror_before_flush(s);
818 819 820 821
            if (!s->synced) {
                if (mirror_flush(s) < 0) {
                    /* Go check s->ret.  */
                    continue;
822 823 824 825 826 827
                }
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
828 829
                block_job_event_ready(&s->common);
                s->synced = true;
P
Paolo Bonzini 已提交
830
            }
831 832 833 834

            should_complete = s->should_complete ||
                block_job_is_cancelled(&s->common);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
P
Paolo Bonzini 已提交
835 836 837 838 839 840 841 842 843
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
844 845 846
             * mirror_populate runs, so pause it now.  Before deciding
             * whether to switch to target check one last time if I/O has
             * come in the meanwhile, and if not flush the data to disk.
P
Paolo Bonzini 已提交
847 848
             */
            trace_mirror_before_drain(s, cnt);
849 850

            bdrv_drained_begin(bs);
851
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
852
            if (cnt > 0 || mirror_flush(s) < 0) {
853 854 855 856 857 858 859 860 861 862 863
                bdrv_drained_end(bs);
                continue;
            }

            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            need_drain = false;
            break;
P
Paolo Bonzini 已提交
864 865 866
        }

        ret = 0;
867
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
P
Paolo Bonzini 已提交
868
        if (!s->synced) {
869
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
P
Paolo Bonzini 已提交
870 871 872 873
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
874
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
875
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
P
Paolo Bonzini 已提交
876
        }
877
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
P
Paolo Bonzini 已提交
878 879 880
    }

immediate_exit:
881 882 883 884 885 886
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
887
        assert(need_drain);
888
        mirror_wait_for_all_io(s);
889 890 891
    }

    assert(s->in_flight == 0);
892
    qemu_vfree(s->buf);
893
    g_free(s->cow_bitmap);
894
    g_free(s->in_flight_bitmap);
895
    bdrv_dirty_iter_free(s->dbi);
F
Fam Zheng 已提交
896
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
897 898 899

    data = g_malloc(sizeof(*data));
    data->ret = ret;
900 901 902 903

    if (need_drain) {
        bdrv_drained_begin(bs);
    }
904
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
P
Paolo Bonzini 已提交
905 906 907 908 909 910 911
}

/*
 * Job driver .set_speed callback shared by the mirror and active-commit
 * drivers.
 *
 * A negative @speed is rejected through @errp.  Otherwise @speed is divided
 * by BDRV_SECTOR_SIZE and handed to the job's rate limiter together with
 * SLICE_TIME as the slice length.
 */
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);

    if (speed >= 0) {
        ratelimit_set_speed(&mirror->limit, speed / BDRV_SECTOR_SIZE,
                            SLICE_TIME);
        return;
    }

    error_setg(errp, QERR_INVALID_PARAMETER, "speed");
}

P
Paolo Bonzini 已提交
918 919 920
/*
 * Job driver .complete callback.
 *
 * Fails through @errp if the job has not reached the synced state yet.
 * Otherwise it optionally opens the target's backing file
 * (MIRROR_OPEN_BACKING_CHAIN), blocks all operations on the node named by
 * s->replaces (if any), flags the job as should_complete and re-enters it.
 */
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *target_bs = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        /* The target must not have a backing file attached yet */
        assert(!target_bs->backing);
        if (bdrv_open_backing_file(target_bs, NULL, "backing", errp) < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *ctx;

        s->to_replace = bdrv_find_node(s->replaces);
        if (s->to_replace == NULL) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        ctx = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(ctx);

        /* TODO Translate this into permission system. Current definition of
         * GRAPH_MOD would require to request it for the parents; they might
         * not even be BlockDriverStates, however, so a BdrvChild can't address
         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(ctx);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

970
/*
 * Job driver .pause callback: hands the owning MirrorBlockJob to
 * mirror_wait_for_all_io().
 */
static void mirror_pause(BlockJob *job)
{
    mirror_wait_for_all_io(container_of(job, MirrorBlockJob, common));
}

/*
 * Job driver .attached_aio_context callback: moves the target BlockBackend
 * of the job to @new_context.
 */
static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target_blk = mirror->target;

    blk_set_aio_context(target_blk, new_context);
}

984 985 986 987 988 989 990 991 992 993 994 995 996 997 998
/*
 * Job driver .drain callback: drains the target BlockBackend, if the job
 * has one.
 */
static void mirror_drain(BlockJob *job)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *blk = mirror->target;

    if (!blk) {
        return;
    }

    /* Need to keep a reference in case blk_drain triggers execution
     * of mirror_complete...
     */
    blk_ref(blk);
    blk_drain(blk);
    blk_unref(blk);
}

999
static const BlockJobDriver mirror_job_driver = {
1000 1001 1002
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1003
    .start                  = mirror_run,
1004 1005 1006
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1007
    .drain                  = mirror_drain,
P
Paolo Bonzini 已提交
1008 1009
};

F
Fam Zheng 已提交
1010
static const BlockJobDriver commit_active_job_driver = {
1011 1012 1013
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1014
    .start                  = mirror_run,
1015 1016 1017
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1018
    .drain                  = mirror_drain,
F
Fam Zheng 已提交
1019 1020
};

1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
/* Read callback of the mirror_top driver: forwards to the backing child. */
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    int ret = bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);

    return ret;
}

/* Write callback of the mirror_top driver: forwards to the backing child. */
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    int ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);

    return ret;
}

/*
 * Flush callback of the mirror_top driver: flushes the node behind the
 * backing child.
 *
 * Returns 0 without doing anything when no backing child is attached.
 * mirror_start_job() drops its mirror_top_bs reference right after a failed
 * bdrv_append(), i.e. for a node whose backing link was never set up; if the
 * block layer flushes the node while tearing it down (NOTE(review): confirm
 * against bdrv_close()), dereferencing bs->backing here would crash.
 * Otherwise returns the result of bdrv_co_flush().
 */
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
    if (bs->backing == NULL) {
        /* Nothing below us to flush */
        return 0;
    }

    return bdrv_co_flush(bs->backing->bs);
}

/*
 * Block-status callback of the mirror_top driver.
 *
 * Reports the whole requested range (*pnum = @nb_sectors) as raw data with
 * a valid offset, points *file at the backing node and encodes the byte
 * offset of @sector_num in the return value.
 */
static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    int64_t byte_offset = sector_num << BDRV_SECTOR_BITS;

    *file = bs->backing->bs;
    *pnum = nb_sectors;

    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
           byte_offset;
}

/* Write-zeroes callback of the mirror_top driver: forwards to the backing
 * child. */
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
{
    int ret = bdrv_co_pwrite_zeroes(bs->backing, offset, count, flags);

    return ret;
}

/* Discard callback of the mirror_top driver: forwards to the node behind the
 * backing child. */
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
    int64_t offset, int count)
{
    BlockDriverState *backing_node = bs->backing->bs;

    return bdrv_co_pdiscard(backing_node, offset, count);
}

/* Close callback of the mirror_top driver.  The body is empty, so no
 * per-node teardown is performed here. */
static void bdrv_mirror_top_close(BlockDriverState *bs)
{
}

/*
 * Child-permission callback of the mirror_top driver.
 *
 * Requests BLK_PERM_WRITE on the child only if the parents need it (@perm)
 * and shares every permission.  @c, @role and @shared are not consulted.
 */
static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    /* Must be able to forward guest writes to the real image */
    *nperm = (perm & BLK_PERM_WRITE) ? BLK_PERM_WRITE : 0;
    *nshared = BLK_PERM_ALL;
}

/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain.
 * Every I/O callback forwards the request to the backing child. */
static BlockDriver bdrv_mirror_top = {
    .format_name                = "mirror_top",

    /* Data path */
    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,
    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,

    /* Node management */
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
    .bdrv_close                 = bdrv_mirror_top_close,
};

1092
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
1093 1094 1095
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
                             uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1096
                             BlockMirrorBackingMode backing_mode,
1097 1098
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
1099
                             bool unmap,
1100
                             BlockCompletionFunc *cb,
1101 1102
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
1103
                             bool is_none_mode, BlockDriverState *base,
1104
                             bool auto_complete, const char *filter_node_name)
P
Paolo Bonzini 已提交
1105 1106
{
    MirrorBlockJob *s;
1107 1108 1109
    BlockDriverState *mirror_top_bs;
    bool target_graph_mod;
    bool target_is_backing;
1110
    Error *local_err = NULL;
1111
    int ret;
P
Paolo Bonzini 已提交
1112

1113
    if (granularity == 0) {
1114
        granularity = bdrv_get_default_bitmap_granularity(target);
1115 1116 1117 1118
    }

    assert ((granularity & (granularity - 1)) == 0);

W
Wen Congyang 已提交
1119 1120 1121 1122 1123 1124 1125 1126
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }
F
Fam Zheng 已提交
1127

1128 1129 1130
    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
     * the backing chain writable. */
1131 1132
    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
                                         BDRV_O_RDWR, errp);
1133 1134 1135 1136 1137 1138 1139 1140 1141
    if (mirror_top_bs == NULL) {
        return;
    }
    mirror_top_bs->total_sectors = bs->total_sectors;

    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
     * it alive until block_job_create() even if bs has no parent. */
    bdrv_ref(mirror_top_bs);
    bdrv_drained_begin(bs);
1142
    bdrv_append(mirror_top_bs, bs, &local_err);
1143 1144
    bdrv_drained_end(bs);

1145 1146 1147 1148 1149 1150
    if (local_err) {
        bdrv_unref(mirror_top_bs);
        error_propagate(errp, local_err);
        return;
    }

1151 1152 1153 1154 1155
    /* Make sure that the source is not resized while the job is running */
    s = block_job_create(job_id, driver, mirror_top_bs,
                         BLK_PERM_CONSISTENT_READ,
                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
1156
                         creation_flags, cb, opaque, errp);
1157
    bdrv_unref(mirror_top_bs);
P
Paolo Bonzini 已提交
1158
    if (!s) {
1159
        goto fail;
P
Paolo Bonzini 已提交
1160
    }
1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
    s->source = bs;
    s->mirror_top_bs = mirror_top_bs;

    /* No resize for the target either; while the mirror is still running, a
     * consistent read isn't necessarily possible. We could possibly allow
     * writes and graph modifications, though it would likely defeat the
     * purpose of a mirror, so leave them blocked for now.
     *
     * In the case of active commit, things look a bit different, though,
     * because the target is an already populated backing file in active use.
     * We can allow anything except resize there.*/
    target_is_backing = bdrv_chain_contains(bs, target);
    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                        BLK_PERM_WRITE_UNCHANGED |
                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
                                             BLK_PERM_WRITE |
                                             BLK_PERM_GRAPH_MOD : 0));
1180 1181
    ret = blk_insert_bs(s->target, target, errp);
    if (ret < 0) {
1182
        goto fail;
1183
    }
K
Kevin Wolf 已提交
1184

1185
    s->replaces = g_strdup(replaces);
1186 1187
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
F
Fam Zheng 已提交
1188
    s->is_none_mode = is_none_mode;
M
Max Reitz 已提交
1189
    s->backing_mode = backing_mode;
F
Fam Zheng 已提交
1190
    s->base = base;
1191
    s->granularity = granularity;
W
Wen Congyang 已提交
1192
    s->buf_size = ROUND_UP(buf_size, granularity);
1193
    s->unmap = unmap;
1194 1195 1196
    if (auto_complete) {
        s->should_complete = true;
    }
1197

1198
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
1199
    if (!s->dirty_bitmap) {
1200
        goto fail;
1201
    }
1202

1203
    /* Required permissions are already taken with blk_new() */
1204 1205 1206
    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);

1207 1208
    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
1209
    if (target_is_backing) {
1210 1211
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
             * ourselves at s->base (if writes are blocked for a node, they are
             * also blocked for its backing file). The other options would be a
             * second filter driver above s->base (== target). */
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                     errp);
            if (ret < 0) {
                goto fail;
            }
1222 1223
        }
    }
1224

J
John Snow 已提交
1225 1226
    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
1227 1228 1229 1230 1231 1232 1233 1234 1235
    return;

fail:
    if (s) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
    }

1236
    bdrv_child_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL);
1237
    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));
P
Paolo Bonzini 已提交
1238
}
F
Fam Zheng 已提交
1239

1240 1241
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
1242
                  int64_t speed, uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1243 1244
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
F
Fam Zheng 已提交
1245
                  BlockdevOnError on_target_error,
1246
                  bool unmap, const char *filter_node_name, Error **errp)
F
Fam Zheng 已提交
1247 1248 1249 1250
{
    bool is_none_mode;
    BlockDriverState *base;

1251 1252
    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
1253 1254
        return;
    }
F
Fam Zheng 已提交
1255
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
1256
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
1257
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
M
Max Reitz 已提交
1258
                     speed, granularity, buf_size, backing_mode,
1259
                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
1260 1261
                     &mirror_job_driver, is_none_mode, base, false,
                     filter_node_name);
F
Fam Zheng 已提交
1262 1263
}

1264
void commit_active_start(const char *job_id, BlockDriverState *bs,
1265 1266
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
1267
                         const char *filter_node_name,
1268
                         BlockCompletionFunc *cb, void *opaque, Error **errp,
1269
                         bool auto_complete)
F
Fam Zheng 已提交
1270
{
1271
    int orig_base_flags;
1272
    Error *local_err = NULL;
1273 1274 1275

    orig_base_flags = bdrv_get_flags(base);

F
Fam Zheng 已提交
1276 1277 1278
    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }
1279

1280
    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
1281
                     MIRROR_LEAVE_BACKING_CHAIN,
1282
                     on_error, on_error, true, cb, opaque, &local_err,
1283
                     &commit_active_job_driver, false, base, auto_complete,
1284
                     filter_node_name);
1285
    if (local_err) {
1286
        error_propagate(errp, local_err);
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
F
Fam Zheng 已提交
1297
}