mirror.c 43.4 KB
Newer Older
P
Paolo Bonzini 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
P
Paolo Bonzini 已提交
15
#include "trace.h"
16
#include "block/blockjob_int.h"
17
#include "block/block_int.h"
18
#include "sysemu/block-backend.h"
19
#include "qapi/error.h"
20
#include "qapi/qmp/qerror.h"
P
Paolo Bonzini 已提交
21
#include "qemu/ratelimit.h"
22
#include "qemu/bitmap.h"
P
Paolo Bonzini 已提交
23

24 25
/* Length of one time slice in nanoseconds; the job compares elapsed time
 * against this value to decide when it has to yield. */
#define SLICE_TIME    100000000ULL /* ns */
/* Maximum number of mirror operations that may be outstanding at once. */
#define MAX_IN_FLIGHT 16
/* Upper bound, in sectors, for a single data copy request. */
#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 MiB */
/* Default bounce buffer size in bytes: MAX_IN_FLIGHT requests of
 * MAX_IO_SECTORS each. */
#define DEFAULT_MIRROR_BUF_SIZE \
    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)
29 30 31 32 33 34 35

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 *
 * The list link lives at the start of the free chunk itself: callers cast
 * the chunk address to MirrorBuffer * before queueing it, so no separate
 * bookkeeping allocation is made per chunk.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;
P
Paolo Bonzini 已提交
36 37 38 39

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
K
Kevin Wolf 已提交
40
    BlockBackend *target;
41 42
    BlockDriverState *mirror_top_bs;
    BlockDriverState *source;
F
Fam Zheng 已提交
43
    BlockDriverState *base;
44

45 46 47 48 49 50
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
F
Fam Zheng 已提交
51
    bool is_none_mode;
M
Max Reitz 已提交
52
    BlockMirrorBackingMode backing_mode;
53
    BlockdevOnError on_source_error, on_target_error;
P
Paolo Bonzini 已提交
54 55
    bool synced;
    bool should_complete;
56
    int64_t granularity;
57
    size_t buf_size;
M
Max Reitz 已提交
58
    int64_t bdev_length;
59
    unsigned long *cow_bitmap;
F
Fam Zheng 已提交
60
    BdrvDirtyBitmap *dirty_bitmap;
61
    BdrvDirtyBitmapIter *dbi;
P
Paolo Bonzini 已提交
62
    uint8_t *buf;
63 64
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
65

66
    uint64_t last_pause_ns;
67
    unsigned long *in_flight_bitmap;
68
    int in_flight;
69
    int64_t sectors_in_flight;
70
    int ret;
71
    bool unmap;
K
Kevin Wolf 已提交
72
    bool waiting_for_io;
F
Fam Zheng 已提交
73 74
    int target_cluster_sectors;
    int max_iov;
75
    bool initial_zeroing_ongoing;
P
Paolo Bonzini 已提交
76 77
} MirrorBlockJob;

78 79 80 81 82 83 84
/* One outstanding mirror request; passed as the opaque pointer to the AIO
 * completion callbacks and freed by mirror_iteration_done(). */
typedef struct MirrorOp {
    MirrorBlockJob *s;      /* owning job */
    QEMUIOVector qiov;      /* bounce buffer chunks backing the request */
    int64_t sector_num;     /* first sector covered by the request */
    int nb_sectors;         /* number of sectors covered by the request */
} MirrorOp;

85 86 87 88 89
/* Decide how to react to an I/O error.
 *
 * @read selects the source policy (true) or the target policy (false);
 * @error is the positive errno value.  Any error drops the job out of the
 * synced state.
 */
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    BlockdevOnError policy = read ? s->on_source_error : s->on_target_error;

    s->synced = false;
    return block_job_error_action(&s->common, policy, read, error);
}

98 99 100
/* Common completion path for every mirror request.
 *
 * Updates the in-flight accounting, hands the request's buffer chunks back
 * to the free list, clears the in-flight bits of the covered chunks and, on
 * success (@ret >= 0), records progress.  Frees @op and finally re-enters
 * the job coroutine if it is waiting for I/O.
 */
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    /* Every iovec element is one chunk of the bounce buffer. */
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        /* Requests issued by the initial zeroing pass are not counted as
         * progress. */
        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
140 141

    aio_context_acquire(blk_get_aio_context(s->common.blk));
142 143 144
    if (ret < 0) {
        BlockErrorAction action;

145
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
146
        action = mirror_error_action(s, false, -ret);
W
Wenchao Xia 已提交
147
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
148 149 150 151
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
152
    aio_context_release(blk_get_aio_context(s->common.blk));
153 154 155 156 157 158
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
159 160

    aio_context_acquire(blk_get_aio_context(s->common.blk));
161 162 163
    if (ret < 0) {
        BlockErrorAction action;

164
        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
165
        action = mirror_error_action(s, true, -ret);
W
Wenchao Xia 已提交
166
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
167 168 169 170
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
171 172 173
    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
174
    }
175
    aio_context_release(blk_get_aio_context(s->common.blk));
176 177
}

178 179 180 181 182 183 184 185
/* Limit *nb_sectors so that the range starting at sector_num does not extend
 * past the end of the source image (s->bdev_length, in bytes).
 *
 * NOTE(review): the result relies on the implicit conversions inside MIN();
 * presumably callers never pass a sector_num beyond the end of the image -
 * confirm before introducing typed temporaries here. */
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

F
Fam Zheng 已提交
186 187 188 189 190
/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original.
 *
 * COW is considered necessary when the chunk containing the first or the
 * last sector of the range has not been marked in s->cow_bitmap yet.  The
 * (possibly widened) range is then capped to what fits into s->max_iov
 * chunks and clipped to the end of the source image.  Both *sector_num and
 * *nb_sectors are updated in place.
 */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            /* Keep the shortened request a whole number of clusters. */
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

F
Fam Zheng 已提交
226 227 228 229 230 231 232 233
/* Park the job coroutine until a mirror request completes.
 *
 * waiting_for_io is what mirror_iteration_done() checks before re-entering
 * the coroutine, so it must be set before yielding and cleared right after
 * the coroutine is resumed.  Nested waits are not allowed. */
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

F
Fam Zheng 已提交
234
/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 *
 * The request is first limited to the bounce buffer size and to
 * s->max_iov chunks.  The function may yield until enough free chunks are
 * available.  Completion is handled by mirror_read_complete().
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    /* Wait for completions until enough chunks have been returned. */
    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        /* The last chunk may be used only partially. */
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

/* Submit an asynchronous discard (@is_discard) or zero write for
 * @nb_sectors sectors of the target starting at @sector_num.
 *
 * No bounce buffer is needed.  Completion is handled by
 * mirror_write_complete().
 */
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    /* Both paths convert sectors to bytes by multiplying with
     * BDRV_SECTOR_SIZE, so the arithmetic is not done by shifting an int. */
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num * BDRV_SECTOR_SIZE,
                         op->nb_sectors * BDRV_SECTOR_SIZE,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

/* Mirror one run of consecutive dirty chunks.
 *
 * Picks the next dirty chunk from the bitmap iterator (restarting from the
 * beginning when the iterator is exhausted), extends the run over the
 * following dirty chunks, clears their dirty bits, marks them in flight and
 * submits copy, zero or discard requests for them.
 *
 * Returns the delay in nanoseconds computed by the rate limiter for the last
 * submitted request, or 0 if no speed limit is set or an error has been
 * recorded in s->ret.
 */
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        /* Iterator exhausted: start over from the first sector. */
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    /* Do not touch a chunk that still has a request outstanding. */
    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one.  The run ends at the end of the image, at the first clean chunk,
     * at the first chunk that is still in flight, or once it would no
     * longer fit into the bounce buffer. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            /* Status unknown: fall back to a plain copy of limited size. */
            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
        } else if (ret & BDRV_BLOCK_DATA) {
            io_sectors = MIN(io_sectors, max_io_sectors);
        }

        /* Work on whole chunks, but on at least one. */
        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            /* Zero/discard is only used when the range is already aligned
             * to the target's clusters. */
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        if (s->ret < 0) {
            return 0;
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            /* Zero/discard requests are not charged to the rate limit when
             * the target can write zeroes with unmap. */
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}
465

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
/* Carve the bounce buffer s->buf into granularity-sized chunks and put all
 * of them on the free list.
 *
 * Only whole chunks are queued: if buf_size is not a multiple of the
 * granularity the trailing remainder is left unused instead of letting the
 * size counter wrap around and walking past the end of the buffer.
 */
static void mirror_free_init(MirrorBlockJob *s)
{
    size_t granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    /* A zero chunk size would never make progress below. */
    assert(granularity > 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size >= granularity) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

483 484 485 486 487
/* Wait until no mirror request is outstanding any more.
 *
 * This is also used for the .pause callback. There is no matching
 * mirror_resume() because mirror_run() will begin iterating again
 * when the job is resumed.
 */
static void mirror_wait_for_all_io(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

494 495 496 497 498 499 500 501 502
/* Data handed from mirror_run() to mirror_exit(); allocated by the former
 * and freed by the latter. */
typedef struct {
    int ret;    /* job result: 0 or a negative errno value */
} MirrorExitData;

/* Deferred completion handler of the mirror job.
 *
 * @opaque is the MirrorExitData allocated by mirror_run(); it is freed here.
 * Depending on the backing mode the source (or base) becomes the backing
 * file of the target, and if the job is to complete successfully the target
 * replaces the source (or the explicitly requested node) in the graph.
 * Afterwards the job's references and blockers are dropped, the mirror
 * filter node is removed and the job is reported as completed.
 */
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
    Error *local_err = NULL;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);
    bdrv_ref(mirror_top_bs);

    /* We don't access the source any more. Dropping any WRITE/RESIZE is
     * required before it could become a backing file of target_bs. */
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target_bs) != backing) {
            bdrv_set_backing_hd(target_bs, backing, &local_err);
            if (local_err) {
                error_report_err(local_err);
                data->ret = -EPERM;
            }
        }
    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        /* NOTE(review): the result of bdrv_reopen() is not checked here. */
        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    blk_unref(s->target);
    s->target = NULL;

    /* Remove the mirror filter driver from the graph. Before this, get rid of
     * the blockers on the intermediate nodes so that the resulting state is
     * valid. */
    block_job_remove_all_bdrv(job);
    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));

    /* We just changed the BDS the job BB refers to (with either or both of the
     * bdrv_replace_in_backing_chain() calls), so switch the BB back so the
     * cleanup does the right thing. We don't need any permissions any more
     * now. */
    blk_remove_bs(job->blk);
    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);

    block_job_completed(&s->common, data->ret);

    g_free(data);
    /* Ends the drained section that mirror_run() left open on the source. */
    bdrv_drained_end(src);
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
}

583 584 585 586 587 588 589 590 591 592 593 594
/* Give other users of the main loop a chance to run.
 *
 * Once more than SLICE_TIME has passed since the last recorded pause, the
 * timestamp is refreshed and the job sleeps for zero nanoseconds; otherwise
 * only a pause point is offered.
 */
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t current_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (current_ns - s->last_pause_ns <= SLICE_TIME) {
        /* Still inside the current slice. */
        block_job_pause_point(&s->common);
        return;
    }

    s->last_pause_ns = current_ns;
    block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
}

595 596 597 598
/* Populate the dirty bitmap before the main copy loop starts.
 *
 * If there is no base and the target is not known to read as zeroes, the
 * target is either zeroed up front (when it can write zeroes with unmap) or
 * the whole image is simply marked dirty.  Otherwise, and after zeroing,
 * every range that is allocated above the base is marked dirty.
 *
 * Returns 0 on success or cancellation, or a negative errno value if the
 * allocation query fails.
 */
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            /* Largest granularity-aligned request whose byte count still
             * fits into an int. */
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_wait_for_all_io(s);
        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        /* n sectors share the same allocation status. */
        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

663 664 665 666 667 668 669 670 671 672 673 674 675 676
/* Flush the target.  Used when leaving the streaming phase, so that the bulk
 * of the data reaches the medium, and again right before completing.
 *
 * Returns the result of the flush.  On failure the target error policy is
 * consulted and, if the error has to be reported, it is stored in s->ret.
 */
static int mirror_flush(MirrorBlockJob *s)
{
    BlockErrorAction action;
    int ret = blk_flush(s->target);

    if (ret >= 0) {
        return ret;
    }

    action = mirror_error_action(s, false, -ret);
    if (action == BLOCK_ERROR_ACTION_REPORT) {
        s->ret = ret;
    }
    return ret;
}

P
Paolo Bonzini 已提交
677 678 679
/* Main coroutine of the mirror job.
 *
 * Sets up the per-job state (lengths, bitmaps, bounce buffer), optionally
 * seeds the dirty bitmap, and then loops: copy dirty chunks, yield
 * regularly, announce readiness once source and target are in sync, and
 * leave the loop when completion or cancellation is requested.  On every
 * exit path the resources allocated here are released and mirror_exit() is
 * scheduled in the main loop with the job result.
 */
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    bool need_drain = true;
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    }

    /* Active commit must resize the base image if its size differs from the
     * active layer. */
    if (s->base == blk_bs(s->target)) {
        int64_t base_length;

        base_length = blk_getlength(s->target);
        if (base_length < 0) {
            ret = base_length;
            goto immediate_exit;
        }

        if (s->bdev_length > base_length) {
            ret = blk_truncate(s->target, s->bdev_length);
            if (ret < 0) {
                goto immediate_exit;
            }
        }
    }

    if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    /* Number of granularity-sized chunks in the source. */
    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            if (!s->synced) {
                if (mirror_flush(s) < 0) {
                    /* Go check s->ret.  */
                    continue;
                }
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                block_job_event_ready(&s->common);
                s->synced = true;
            }

            should_complete = s->should_complete ||
                block_job_is_cancelled(&s->common);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * reading the dirty count again, or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while the job
             * runs, so pause it now.  Before deciding whether to switch to
             * target check one last time if I/O has come in the meanwhile,
             * and if not flush the data to disk.
             */
            trace_mirror_before_drain(s, cnt);

            bdrv_drained_begin(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            if (cnt > 0 || mirror_flush(s) < 0) {
                bdrv_drained_end(bs);
                continue;
            }

            /* The two disks are in sync.  Exit and report successful
             * completion.  The source stays drained; mirror_exit() ends the
             * drained section.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            need_drain = false;
            break;
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            /* Idle for a full slice when there is nothing to do. */
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        assert(need_drain);
        mirror_wait_for_all_io(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_dirty_iter_free(s->dbi);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;

    /* mirror_exit() always ends a drained section on the source, so begin
     * one here unless the loop above already left one open. */
    if (need_drain) {
        bdrv_drained_begin(bs);
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

/*
 * Block job .set_speed callback: apply a new rate limit to a mirror job.
 *
 * @job:   block job embedded in a MirrorBlockJob
 * @speed: requested speed; divided by BDRV_SECTOR_SIZE before it is handed
 *         to the rate limiter.  Negative values are rejected.
 * @errp:  receives QERR_INVALID_PARAMETER ("speed") if @speed is negative
 */
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *mirror;

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }

    mirror = container_of(job, MirrorBlockJob, common);
    ratelimit_set_speed(&mirror->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

P
Paolo Bonzini 已提交
910 911 912
/*
 * Block job .complete callback: request completion of a synced mirror job.
 *
 * Fails with an error if the job has not reached the synced state yet.
 * With MIRROR_OPEN_BACKING_CHAIN the target's backing file is opened first.
 * If a node to replace was named, it is looked up, all operations on it are
 * blocked and a reference is taken.  Finally should_complete is set and the
 * job is entered.
 *
 * @job:  block job embedded in a MirrorBlockJob
 * @errp: receives the error if the job cannot be completed
 */
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        BlockDriverState *target_bs = blk_bs(s->target);

        assert(!target_bs->backing);
        if (bdrv_open_backing_file(target_bs, NULL, "backing", errp) < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *ctx;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        ctx = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(ctx);

        /* TODO Translate this into permission system. Current definition of
         * GRAPH_MOD would require to request it for the parents; they might
         * not even be BlockDriverStates, however, so a BdrvChild can't address
         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(ctx);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

962
/*
 * Block job .pause callback: wait for all of the mirror job's I/O
 * (mirror_wait_for_all_io()) before returning.
 */
static void mirror_pause(BlockJob *job)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);

    mirror_wait_for_all_io(mirror);
}

/*
 * Block job .attached_aio_context callback: move the target BlockBackend
 * to @new_context.
 */
static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target_blk = mirror->target;

    blk_set_aio_context(target_blk, new_context);
}

976 977 978 979 980 981 982 983 984 985 986 987 988 989 990
/*
 * Block job .drain callback: drain the target BlockBackend, if there is one.
 */
static void mirror_drain(BlockJob *job)
{
    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
    BlockBackend *target = mirror->target;

    if (!target) {
        return;
    }

    /* Hold our own reference across the drain in case blk_drain triggers
     * execution of mirror_complete...
     */
    blk_ref(target);
    blk_drain(target);
    blk_unref(target);
}

991
static const BlockJobDriver mirror_job_driver = {
992 993 994
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
995
    .start                  = mirror_run,
996 997 998
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
999
    .drain                  = mirror_drain,
P
Paolo Bonzini 已提交
1000 1001
};

F
Fam Zheng 已提交
1002
static const BlockJobDriver commit_active_job_driver = {
1003 1004 1005
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
J
John Snow 已提交
1006
    .start                  = mirror_run,
1007 1008 1009
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
1010
    .drain                  = mirror_drain,
F
Fam Zheng 已提交
1011 1012
};

1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083
/* Read callback of the mirror_top filter: forward the read to the backing
 * child unchanged. */
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *below = bs->backing;

    return bdrv_co_preadv(below, offset, bytes, qiov, flags);
}

/* Write callback of the mirror_top filter: forward the write to the backing
 * child unchanged. */
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BdrvChild *below = bs->backing;

    return bdrv_co_pwritev(below, offset, bytes, qiov, flags);
}

/* Flush callback of the mirror_top filter: flush the node behind the backing
 * child. */
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
    BlockDriverState *below = bs->backing->bs;

    return bdrv_co_flush(below);
}

/*
 * Block status callback of the mirror_top filter.
 *
 * Reports the whole requested range (@nb_sectors) as raw data with a valid
 * offset equal to the byte position of @sector_num, and points @file at the
 * node behind the backing child.
 */
static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    int64_t byte_offset = sector_num << BDRV_SECTOR_BITS;

    *pnum = nb_sectors;
    *file = bs->backing->bs;

    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
           byte_offset;
}

/* Write-zeroes callback of the mirror_top filter: forward the request to the
 * backing child unchanged. */
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
{
    BdrvChild *below = bs->backing;

    return bdrv_co_pwrite_zeroes(below, offset, count, flags);
}

/* Discard callback of the mirror_top filter: forward the request to the node
 * behind the backing child. */
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
    int64_t offset, int count)
{
    BlockDriverState *below = bs->backing->bs;

    return bdrv_co_pdiscard(below, offset, count);
}

/* Close callback of the mirror_top filter; intentionally does nothing. */
static void bdrv_mirror_top_close(BlockDriverState *bs)
{
    /* Empty on purpose: this function performs no cleanup. */
}

/*
 * Permission callback of the mirror_top filter.
 *
 * @perm:    permissions requested on this node by its parents
 * @nperm:   set to BLK_PERM_WRITE if @perm contains it, otherwise to 0
 * @nshared: always set to BLK_PERM_ALL
 *
 * @c, @role and @shared are not consulted.
 */
static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    /* Must be able to forward guest writes to the real image */
    *nperm = (perm & BLK_PERM_WRITE) ? BLK_PERM_WRITE : 0;
    *nshared = BLK_PERM_ALL;
}

/* Dummy filter node: it offers consistent reads to its users without
 * requiring them from its backing file, and it allows writes on the backing
 * file chain. */
static BlockDriver bdrv_mirror_top = {
    .format_name                = "mirror_top",

    /* Data path */
    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,

    /* Metadata, permissions and teardown */
    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
    .bdrv_close                 = bdrv_mirror_top_close,
};

1084
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
1085 1086 1087
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
                             uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1088
                             BlockMirrorBackingMode backing_mode,
1089 1090
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
1091
                             bool unmap,
1092
                             BlockCompletionFunc *cb,
1093 1094
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
1095
                             bool is_none_mode, BlockDriverState *base,
1096
                             bool auto_complete, const char *filter_node_name)
P
Paolo Bonzini 已提交
1097 1098
{
    MirrorBlockJob *s;
1099 1100 1101
    BlockDriverState *mirror_top_bs;
    bool target_graph_mod;
    bool target_is_backing;
1102
    Error *local_err = NULL;
1103
    int ret;
P
Paolo Bonzini 已提交
1104

1105
    if (granularity == 0) {
1106
        granularity = bdrv_get_default_bitmap_granularity(target);
1107 1108 1109 1110
    }

    assert ((granularity & (granularity - 1)) == 0);

W
Wen Congyang 已提交
1111 1112 1113 1114 1115 1116 1117 1118
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }
F
Fam Zheng 已提交
1119

1120 1121 1122
    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
     * the backing chain writable. */
1123 1124
    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
                                         BDRV_O_RDWR, errp);
1125 1126 1127 1128 1129 1130 1131 1132 1133
    if (mirror_top_bs == NULL) {
        return;
    }
    mirror_top_bs->total_sectors = bs->total_sectors;

    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
     * it alive until block_job_create() even if bs has no parent. */
    bdrv_ref(mirror_top_bs);
    bdrv_drained_begin(bs);
1134
    bdrv_append(mirror_top_bs, bs, &local_err);
1135 1136
    bdrv_drained_end(bs);

1137 1138 1139 1140 1141 1142
    if (local_err) {
        bdrv_unref(mirror_top_bs);
        error_propagate(errp, local_err);
        return;
    }

1143 1144 1145 1146 1147
    /* Make sure that the source is not resized while the job is running */
    s = block_job_create(job_id, driver, mirror_top_bs,
                         BLK_PERM_CONSISTENT_READ,
                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
1148
                         creation_flags, cb, opaque, errp);
1149
    bdrv_unref(mirror_top_bs);
P
Paolo Bonzini 已提交
1150
    if (!s) {
1151
        goto fail;
P
Paolo Bonzini 已提交
1152
    }
1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
    s->source = bs;
    s->mirror_top_bs = mirror_top_bs;

    /* No resize for the target either; while the mirror is still running, a
     * consistent read isn't necessarily possible. We could possibly allow
     * writes and graph modifications, though it would likely defeat the
     * purpose of a mirror, so leave them blocked for now.
     *
     * In the case of active commit, things look a bit different, though,
     * because the target is an already populated backing file in active use.
     * We can allow anything except resize there.*/
    target_is_backing = bdrv_chain_contains(bs, target);
    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                        BLK_PERM_WRITE_UNCHANGED |
                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
                                             BLK_PERM_WRITE |
                                             BLK_PERM_GRAPH_MOD : 0));
1172 1173
    ret = blk_insert_bs(s->target, target, errp);
    if (ret < 0) {
1174
        goto fail;
1175
    }
K
Kevin Wolf 已提交
1176

1177
    s->replaces = g_strdup(replaces);
1178 1179
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
F
Fam Zheng 已提交
1180
    s->is_none_mode = is_none_mode;
M
Max Reitz 已提交
1181
    s->backing_mode = backing_mode;
F
Fam Zheng 已提交
1182
    s->base = base;
1183
    s->granularity = granularity;
W
Wen Congyang 已提交
1184
    s->buf_size = ROUND_UP(buf_size, granularity);
1185
    s->unmap = unmap;
1186 1187 1188
    if (auto_complete) {
        s->should_complete = true;
    }
1189

1190
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
1191
    if (!s->dirty_bitmap) {
1192
        g_free(s->replaces);
K
Kevin Wolf 已提交
1193
        blk_unref(s->target);
1194
        block_job_unref(&s->common);
1195 1196
        return;
    }
1197

1198
    /* Required permissions are already taken with blk_new() */
1199 1200 1201
    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);

1202 1203
    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
1204
    if (target_is_backing) {
1205 1206
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216
            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
             * ourselves at s->base (if writes are blocked for a node, they are
             * also blocked for its backing file). The other options would be a
             * second filter driver above s->base (== target). */
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                     errp);
            if (ret < 0) {
                goto fail;
            }
1217 1218
        }
    }
1219

J
John Snow 已提交
1220 1221
    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231
    return;

fail:
    if (s) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
    }

    bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));
P
Paolo Bonzini 已提交
1232
}
F
Fam Zheng 已提交
1233

1234 1235
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
1236
                  int64_t speed, uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
1237 1238
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
F
Fam Zheng 已提交
1239
                  BlockdevOnError on_target_error,
1240
                  bool unmap, const char *filter_node_name, Error **errp)
F
Fam Zheng 已提交
1241 1242 1243 1244
{
    bool is_none_mode;
    BlockDriverState *base;

1245 1246
    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
1247 1248
        return;
    }
F
Fam Zheng 已提交
1249
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
1250
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
1251
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
M
Max Reitz 已提交
1252
                     speed, granularity, buf_size, backing_mode,
1253
                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
1254 1255
                     &mirror_job_driver, is_none_mode, base, false,
                     filter_node_name);
F
Fam Zheng 已提交
1256 1257
}

1258
void commit_active_start(const char *job_id, BlockDriverState *bs,
1259 1260
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
1261
                         const char *filter_node_name,
1262
                         BlockCompletionFunc *cb, void *opaque, Error **errp,
1263
                         bool auto_complete)
F
Fam Zheng 已提交
1264
{
1265
    int orig_base_flags;
1266
    Error *local_err = NULL;
1267 1268 1269

    orig_base_flags = bdrv_get_flags(base);

F
Fam Zheng 已提交
1270 1271 1272
    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }
1273

1274
    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
1275
                     MIRROR_LEAVE_BACKING_CHAIN,
1276
                     on_error, on_error, true, cb, opaque, &local_err,
1277
                     &commit_active_job_driver, false, base, auto_complete,
1278
                     filter_node_name);
1279
    if (local_err) {
1280
        error_propagate(errp, local_err);
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
F
Fam Zheng 已提交
1291
}