/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    unsigned long *in_flight_bitmap;
    int in_flight;
    int sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

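/* Map an I/O error to the action configured for the job.  A read error
 * is charged to the source, a write error to the target; either way the
 * job drops out of the synced state. */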
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->common.bs,
                                      s->on_source_error, true, error);
    } else {
        return block_job_error_action(&s->common, s->target,
                                      s->on_target_error, false, error);
    }
}

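/* Return the chunks of a finished operation to the free list, clear its
 * bits in the in-flight bitmap and account for progress; on success the
 * copied range is also recorded in the COW bitmap.  Re-enters the job
 * coroutine if it is waiting for in-flight I/O to complete. */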
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_slice_free(MirrorOp, op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co, NULL);
    }
}

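/* AIO callback for the write to the target.  On failure the chunk is
 * re-dirtied so that a later iteration retries it. */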
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

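/* AIO callback for the read from the source.  On success, chain into
 * the corresponding write to the target; on failure, re-dirty the chunk
 * so that it is retried later. */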
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}

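/* Copy one batch of adjacent dirty chunks from the source to the target.
 * Returns the delay requested by the rate limiter, in nanoseconds. */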
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
    uint64_t delay_ns = 0;
    MirrorOp *op;
    int pnum;
    int64_t ret;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(s->sector_num >= 0);
    }

    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
     *
     * We have to do this if we have no backing file yet in the destination,
     * and the cluster size is very large.  Then we need to do COW ourselves.
     * The first time a cluster is copied, copy it entirely.  Note that,
     * because both the granularity and the cluster size are powers of two,
     * the number of sectors to copy cannot exceed one cluster.
     *
     * We also want to extend the QEMUIOVector to include more adjacent
     * dirty blocks if possible, to limit the number of I/O operations and
     * run efficiently even with a small granularity.
     */
    nb_chunks = 0;
    nb_sectors = 0;
    next_sector = sector_num;
    next_chunk = sector_num / sectors_per_chunk;

    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
    while (test_bit(next_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        s->waiting_for_io = true;
        qemu_coroutine_yield();
        s->waiting_for_io = false;
    }

    do {
        int added_sectors, added_chunks;

        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
            test_bit(next_chunk, s->in_flight_bitmap)) {
            assert(nb_sectors > 0);
            break;
        }

        added_sectors = sectors_per_chunk;
        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
            bdrv_round_to_clusters(s->target,
                                   next_sector, added_sectors,
                                   &next_sector, &added_sectors);

            /* On the first iteration, the rounding may make us copy
             * sectors before the first dirty one.
             */
            if (next_sector < sector_num) {
                assert(nb_sectors == 0);
                sector_num = next_sector;
                next_chunk = next_sector / sectors_per_chunk;
            }
        }

        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;

        /* When doing COW, it may happen that there is not enough space for
         * a full cluster.  Wait if that is the case.
         */
        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
            s->waiting_for_io = true;
            qemu_coroutine_yield();
            s->waiting_for_io = false;
        }
        if (s->buf_free_count < nb_chunks + added_chunks) {
            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
            break;
        }
        if (IOV_MAX < nb_chunks + added_chunks) {
            trace_mirror_break_iov_max(s, nb_chunks, added_chunks);
            break;
        }

        /* We have enough free space to copy these sectors.  */
        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);

        nb_sectors += added_sectors;
        nb_chunks += added_chunks;
        next_sector += added_sectors;
        next_chunk += added_chunks;
        if (!s->synced && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
        }
    } while (delay_ns == 0 && next_sector < end);

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_slice_new(MirrorOp);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));

        /* Advance the HBitmapIter in parallel, so that we do not examine
         * the same sector twice.
         */
        if (next_sector > hbitmap_next_sector
            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
        }

        next_sector += sectors_per_chunk;
    }

    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                      nb_sectors, &pnum);
    if (ret < 0 || pnum < nb_sectors ||
            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                       mirror_read_complete, op);
    } else if (ret & BDRV_BLOCK_ZERO) {
        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    } else {
        assert(!(ret & BDRV_BLOCK_DATA));
        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
                         mirror_write_complete, op);
    }
    return delay_ns;
}

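/* Carve s->buf into granularity-sized chunks and put them all on the
 * free list. */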
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

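/* Yield until every in-flight operation has completed. */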
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        s->waiting_for_io = true;
        qemu_coroutine_yield();
        s->waiting_for_io = false;
    }
}

typedef struct {
    int ret;
} MirrorExitData;

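/* Completion callback, run in the main loop once the job coroutine has
 * finished: swap the target into the graph if requested, drop the
 * blocker on the drive-mirror-replace target and report the result. */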
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = s->common.bs;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }
        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
        }
        bdrv_swap(s->target, to_replace);
        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
            /* drop the bs loop chain formed by the swap: break the loop then
             * trigger the unref from the top one */
            BlockDriverState *p = s->base->backing_hd;
            bdrv_set_backing_hd(s->base, NULL);
            bdrv_unref(p);
        }
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
}

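/* The mirror job coroutine: populate the dirty bitmap (unless sync=none),
 * then keep copying dirty chunks until source and target converge and
 * the job is completed or cancelled. */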
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
    int n;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
    if (backing_filename[0] && !s->target->backing_hd) {
        ret = bdrv_get_info(s->target, &bdi);
        if (ret < 0) {
            goto immediate_exit;
        }
        if (s->granularity < bdi.cluster_size) {
            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
            s->cow_bitmap = bitmap_new(length);
        }
    }

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding int limit. */
            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                                 end - sector_num);
            int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

            if (now - last_pause_ns > SLICE_TIME) {
                last_pause_ns = now;
                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
            }

            if (block_job_is_cancelled(&s->common)) {
                goto immediate_exit;
            }

            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);

            if (ret < 0) {
                goto immediate_exit;
            }

            assert(n > 0);
            if (ret == 1) {
                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
            }
            sector_num += n;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                s->waiting_for_io = true;
                qemu_coroutine_yield();
                s->waiting_for_io = false;
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    bdrv_iostatus_disable(s->target);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

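/* Throttle the job; speed is given in bytes per second. */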
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void mirror_iostatus_reset(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

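/* Handle block-job-complete: make sure the target has a backing file to
 * fall back on, block operations on the graph node to be replaced, and
 * ask the job coroutine to finish. */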
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    Error *local_err = NULL;
    int ret;

    ret = bdrv_open_backing_file(s->target, NULL, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
    }
    if (!s->synced) {
        error_setg(errp, QERR_BLOCK_JOB_NOT_READY,
                   bdrv_get_device_name(job->bs));
        return;
    }

    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_MIRROR,
    .set_speed     = mirror_set_speed,
    .iostatus_reset= mirror_iostatus_reset,
    .complete      = mirror_complete,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = mirror_set_speed,
    .iostatus_reset
                   = mirror_iostatus_reset,
    .complete      = mirror_complete,
};

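/* Common setup shared by mirror_start() and commit_active_start(). */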
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert ((granularity & (granularity - 1)) == 0);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->target = target;
    s->is_none_mode = is_none_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        block_job_release(bs);
        return;
    }
    bdrv_set_enable_write_cache(s->target, true);
    bdrv_set_on_error(s->target, on_target_error, on_target_error);
    bdrv_iostatus_enable(s->target);
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
}

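/* Public entry point for drive-mirror: choose the base for sync=top and
 * reject sync modes that this job does not implement. */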
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
    mirror_start_job(bs, target, replaces,
                     speed, granularity, buf_size,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

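/* Public entry point for active block-commit: mirror bs into its base,
 * reopening the base read-write and growing it first if it is smaller
 * than the top image. */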
void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                            "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    bdrv_ref(base);
    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}