/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 MB */
#define DEFAULT_MIRROR_BUF_SIZE \
    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *mirror_top_bs;
    BlockDriverState *source;
    BlockDriverState *base;

    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    BdrvDirtyBitmapIter *dbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
    bool initial_zeroing_ongoing;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

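/* Map an I/O error to the action selected by the configured error policy
 * (on_source_error or on_target_error) and mark the job as not synced. */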
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

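/* Common completion path for mirror I/O: return the buffer chunks to the
 * free list, clear the in-flight bitmap, update progress accounting, and
 * wake up the job coroutine if it is waiting for I/O. */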
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

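/* AIO callback for a write to the target: on error, re-dirty the range and
 * apply the target error policy. */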
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;

    aio_context_acquire(blk_get_aio_context(s->common.blk));
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
    aio_context_release(blk_get_aio_context(s->common.blk));
}

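/* AIO callback for a read from the source: on success, chain into the
 * corresponding write to the target. */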
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;

    aio_context_acquire(blk_get_aio_context(s->common.blk));
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
    }
    aio_context_release(blk_get_aio_context(s->common.blk));
}

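/* Clip *nb_sectors so that a request starting at sector_num does not run
 * past the end of the device. */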
static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

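/* Yield until some in-flight operation completes and re-enters the job
 * coroutine from mirror_iteration_done(). */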
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

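/* Submit an async write-zeroes or discard for a range that is known to be
 * zero or unallocated on the source, instead of copying the data. */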
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is a nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
                         op->nb_sectors << BDRV_SECTOR_BITS,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

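/* One pass of the copy loop: pick the next run of dirty chunks, choose
 * between copy, write-zeroes and discard based on the block status, submit
 * the I/O and return the rate-limiting delay in nanoseconds. */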
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    bdrv_dirty_bitmap_lock(s->dirty_bitmap);
    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }
    bdrv_dirty_bitmap_unlock(s->dirty_bitmap);

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in-flight requests in them. */
    bdrv_dirty_bitmap_lock(s->dirty_bitmap);
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty_locked(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap_locked(s->dirty_bitmap, sector_num,
                                   nb_chunks * sectors_per_chunk);
    bdrv_dirty_bitmap_unlock(s->dirty_bitmap);

    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
        } else if (ret & BDRV_BLOCK_DATA) {
            io_sectors = MIN(io_sectors, max_io_sectors);
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors, &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        if (s->ret < 0) {
            return 0;
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}

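/* Carve s->buf into granularity-sized chunks and put them all on the
 * buf_free list. */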
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

/* This is also used for the .pause callback. There is no matching
 * mirror_resume() because mirror_run() will begin iterating again
 * when the job is resumed.
 */
static void mirror_wait_for_all_io(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

typedef struct {
    int ret;
} MirrorExitData;

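/* Runs in the main loop after the job coroutine has terminated: performs
 * the graph switch to the target (or the to_replace node), removes the
 * mirror_top_bs filter again and completes the job. */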
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
    Error *local_err = NULL;

    bdrv_release_dirty_bitmap(src, s->dirty_bitmap);

    /* Make sure that the source BDS doesn't go away before we have called
     * block_job_completed(). */
    bdrv_ref(src);
    bdrv_ref(mirror_top_bs);
    bdrv_ref(target_bs);

    /* Remove target parent that still uses BLK_PERM_WRITE/RESIZE before
     * inserting target_bs at s->to_replace, where we might not be able to get
     * these permissions.
     *
     * Note that blk_unref() alone doesn't necessarily drop permissions because
     * we might be running nested inside mirror_drain(), which takes an extra
     * reference, so use an explicit blk_set_perm() first. */
    blk_set_perm(s->target, 0, BLK_PERM_ALL, &error_abort);
    blk_unref(s->target);
    s->target = NULL;

    /* We don't access the source any more. Dropping any WRITE/RESIZE is
     * required before it could become a backing file of target_bs. */
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target_bs) != backing) {
            bdrv_set_backing_hd(target_bs, backing, &local_err);
            if (local_err) {
                error_report_err(local_err);
                data->ret = -EPERM;
            }
        }
    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_node(to_replace, target_bs, &local_err);
        bdrv_drained_end(target_bs);
        if (local_err) {
            error_report_err(local_err);
            data->ret = -EPERM;
        }
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_unref(target_bs);

    /* Remove the mirror filter driver from the graph. Before this, get rid of
     * the blockers on the intermediate nodes so that the resulting state is
     * valid. Also give up permissions on mirror_top_bs->backing, which might
     * block the removal. */
    block_job_remove_all_bdrv(job);
    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);

    /* We just changed the BDS the job BB refers to (with either or both of the
     * bdrv_replace_node() calls), so switch the BB back so the cleanup does
     * the right thing. We don't need any permissions any more now. */
    blk_remove_bs(job->blk);
    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);

    block_job_completed(&s->common, data->ret);

    g_free(data);
    bdrv_drained_end(src);
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
}

static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

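/* Prepare a full or top sync: zero out the target if it cannot guarantee
 * zero initialization, then mark all ranges allocated above the base as
 * dirty so that the main loop copies them. */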
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
                                   s->in_flight);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_wait_for_all_io(s);
        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

/* Called when going out of the streaming phase to flush the bulk of the
 * data to the medium, or just before completing.
 */
static int mirror_flush(MirrorBlockJob *s)
{
    int ret = blk_flush(s->target);
    if (ret < 0) {
        if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) {
            s->ret = ret;
        }
    }
    return ret;
}

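/* The body of the mirror job: set up the buffers and dirty bitmap, then
 * keep copying dirty data to the target until it is in sync and the job
 * is completed or cancelled. */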
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    bool need_drain = true;
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    }

    /* Active commit must resize the base image if its size differs from the
     * active layer. */
    if (s->base == blk_bs(s->target)) {
        int64_t base_length;

        base_length = blk_getlength(s->target);
        if (base_length < 0) {
            ret = base_length;
            goto immediate_exit;
        }

        if (s->bdev_length > base_length) {
            ret = blk_truncate(s->target, s->bdev_length, NULL);
            if (ret < 0) {
                goto immediate_exit;
            }
        }
    }

    if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            if (!s->synced) {
                if (mirror_flush(s) < 0) {
                    /* Go check s->ret.  */
                    continue;
                }
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                block_job_event_ready(&s->common);
                s->synced = true;
            }

            should_complete = s->should_complete ||
                block_job_is_cancelled(&s->common);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs, so pause it now.  Before deciding
             * whether to switch to target check one last time if I/O has
             * come in the meanwhile, and if not flush the data to disk.
             */
            trace_mirror_before_drain(s, cnt);

            bdrv_drained_begin(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            if (cnt > 0 || mirror_flush(s) < 0) {
                bdrv_drained_end(bs);
                continue;
            }

            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            need_drain = false;
            break;
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        assert(need_drain);
        mirror_wait_for_all_io(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_dirty_iter_free(s->dbi);

    data = g_malloc(sizeof(*data));
    data->ret = ret;

    if (need_drain) {
        bdrv_drained_begin(bs);
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

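/* .complete callback: optionally open the target's backing chain, block
 * operations on the node to be replaced, and tell the job coroutine to
 * finish. */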
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *target;

    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        /* TODO Translate this into permission system. Current definition of
         * GRAPH_MOD would require to request it for the parents; they might
         * not even be BlockDriverStates, however, so a BdrvChild can't address
         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

static void mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_wait_for_all_io(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static void mirror_drain(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    /* Need to keep a reference in case blk_drain triggers execution
     * of mirror_complete...
     */
    if (s->target) {
        BlockBackend *target = s->target;
        blk_ref(target);
        blk_drain(target);
        blk_unref(target);
    }
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .start                  = mirror_run,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
    .drain                  = mirror_drain,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .start                  = mirror_run,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
    .drain                  = mirror_drain,
};

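/* Passthrough implementations for the mirror_top filter node; every request
 * is simply forwarded to the backing (source) node. */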
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}

static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
}

static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
    return bdrv_co_flush(bs->backing->bs);
}

static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    *pnum = nb_sectors;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
           (sector_num << BDRV_SECTOR_BITS);
}

static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
}

static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
    int64_t offset, int bytes)
{
    return bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
}

static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
{
    bdrv_refresh_filename(bs->backing->bs);
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
}

static void bdrv_mirror_top_close(BlockDriverState *bs)
{
}

static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    /* Must be able to forward guest writes to the real image */
    *nperm = 0;
    if (perm & BLK_PERM_WRITE) {
        *nperm |= BLK_PERM_WRITE;
    }

    *nshared = BLK_PERM_ALL;
}

/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_mirror_top = {
    .format_name                = "mirror_top",
    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,
    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,
    .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
    .bdrv_close                 = bdrv_mirror_top_close,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
};

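/* Common setup for mirror and active commit: insert the mirror_top filter
 * above bs, create the block job and the target BlockBackend, and start
 * the job coroutine. */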
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
                             uint32_t granularity, int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base,
                             bool auto_complete, const char *filter_node_name,
                             Error **errp)
{
    MirrorBlockJob *s;
    BlockDriverState *mirror_top_bs;
    bool target_graph_mod;
    bool target_is_backing;
    Error *local_err = NULL;
    int ret;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert ((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
     * the backing chain writable. */
    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
                                         BDRV_O_RDWR, errp);
    if (mirror_top_bs == NULL) {
        return;
    }
    mirror_top_bs->total_sectors = bs->total_sectors;
    bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));

    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
     * it alive until block_job_create() succeeds even if bs has no parent. */
    bdrv_ref(mirror_top_bs);
    bdrv_drained_begin(bs);
    bdrv_append(mirror_top_bs, bs, &local_err);
    bdrv_drained_end(bs);

    if (local_err) {
        bdrv_unref(mirror_top_bs);
        error_propagate(errp, local_err);
        return;
    }

    /* Make sure that the source is not resized while the job is running */
    s = block_job_create(job_id, driver, mirror_top_bs,
                         BLK_PERM_CONSISTENT_READ,
                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
                         creation_flags, cb, opaque, errp);
    if (!s) {
        goto fail;
    }
    /* The block job now has a reference to this node */
    bdrv_unref(mirror_top_bs);

    s->source = bs;
    s->mirror_top_bs = mirror_top_bs;

    /* No resize for the target either; while the mirror is still running, a
     * consistent read isn't necessarily possible. We could possibly allow
     * writes and graph modifications, though it would likely defeat the
     * purpose of a mirror, so leave them blocked for now.
     *
     * In the case of active commit, things look a bit different, though,
     * because the target is an already populated backing file in active use.
     * We can allow anything except resize there. */
    target_is_backing = bdrv_chain_contains(bs, target);
    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                        BLK_PERM_WRITE_UNCHANGED |
                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
                                             BLK_PERM_WRITE |
                                             BLK_PERM_GRAPH_MOD : 0));
    ret = blk_insert_bs(s->target, target, errp);
    if (ret < 0) {
        goto fail;
    }

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;
    if (auto_complete) {
        s->should_complete = true;
    }

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        goto fail;
    }

    /* Required permissions are already taken with blk_new() */
    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);

    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
    if (target_is_backing) {
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
             * ourselves at s->base (if writes are blocked for a node, they are
             * also blocked for its backing file). The other options would be a
             * second filter driver above s->base (== target). */
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                     errp);
            if (ret < 0) {
                goto fail;
            }
        }
    }

    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
    return;

fail:
    if (s) {
        /* Make sure this BDS does not go away until we have completed the graph
         * changes below */
        bdrv_ref(mirror_top_bs);

        g_free(s->replaces);
        blk_unref(s->target);
        block_job_early_fail(&s->common);
    }

    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
                            &error_abort);
    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);

    bdrv_unref(mirror_top_bs);
}

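/* Entry point for the drive-mirror and blockdev-mirror QMP commands (wired
 * up in blockdev.c); sync mode 'incremental' is rejected here. */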
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap, const char *filter_node_name, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, NULL, NULL,
                     &mirror_job_driver, is_none_mode, base, false,
                     filter_node_name, errp);
}

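/* Entry point for active block commit: mirror the active layer into its
 * base image, restoring the base's original flags if starting fails. */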
void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
                         const char *filter_node_name,
                         BlockCompletionFunc *cb, void *opaque,
                         bool auto_complete, Error **errp)
{
    int orig_base_flags;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, true, cb, opaque,
                     &commit_active_job_driver, false, base, auto_complete,
                     filter_node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}