/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    unsigned long *in_flight_bitmap;
    int in_flight;
    int sectors_in_flight;
    int ret;
    bool unmap;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

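/* Decide how to react to an I/O error, using the policy configured for the
 * side on which it occurred: on_source_error for reads from the source,
 * on_target_error for writes to the target.  Any error also means the
 * target is no longer known to be in sync. */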
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->common.bs,
                                      s->on_source_error, true, error);
    } else {
        return block_job_error_action(&s->common, s->target,
                                      s->on_target_error, false, error);
    }
}

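/* Completion path shared by reads and writes: return the operation's chunks
 * to the free list, clear them in the in-flight bitmap, account the copied
 * sectors on success, and re-enter the job coroutine unless it is sleeping
 * for rate limiting. */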
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_slice_free(MirrorOp, op);

    /* Enter coroutine when it is not sleeping.  The coroutine sleeps to
     * rate-limit itself.  The coroutine will eventually resume since there is
     * a sleep timeout so don't wake it early.
     */
    if (s->common.busy) {
        qemu_coroutine_enter(s->common.co, NULL);
    }
}

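/* Completion callback for the write to the target.  On failure, re-dirty
 * the sectors so that they will be retried and apply the target error
 * policy. */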
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

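/* Completion callback for the read from the source.  On success, chain
 * into the corresponding write to the target; on failure, re-dirty the
 * sectors and apply the source error policy. */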
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}

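/* Copy one batch of dirty data: pick the next dirty sector from the dirty
 * bitmap iterator, extend the operation over adjacent dirty chunks (rounding
 * to cluster boundaries when we have to do COW for the target ourselves),
 * build a QEMUIOVector out of free granularity-sized chunks, and submit the
 * asynchronous read; the write is issued from mirror_read_complete().
 * Returns how long the caller should sleep, in ns, for rate limiting. */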
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
    uint64_t delay_ns = 0;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(s->sector_num >= 0);
    }

    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
     *
     * We have to do this if we have no backing file yet in the destination,
     * and the cluster size is very large.  Then we need to do COW ourselves.
     * The first time a cluster is copied, copy it entirely.  Note that,
     * because both the granularity and the cluster size are powers of two,
     * the number of sectors to copy cannot exceed one cluster.
     *
     * We also want to extend the QEMUIOVector to include more adjacent
     * dirty blocks if possible, to limit the number of I/O operations and
     * run efficiently even with a small granularity.
     */
    nb_chunks = 0;
    nb_sectors = 0;
    next_sector = sector_num;
    next_chunk = sector_num / sectors_per_chunk;

    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
    while (test_bit(next_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        qemu_coroutine_yield();
    }

    do {
        int added_sectors, added_chunks;

        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
            test_bit(next_chunk, s->in_flight_bitmap)) {
            assert(nb_sectors > 0);
            break;
        }

        added_sectors = sectors_per_chunk;
        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
            bdrv_round_to_clusters(s->target,
                                   next_sector, added_sectors,
                                   &next_sector, &added_sectors);

            /* On the first iteration, the rounding may make us copy
             * sectors before the first dirty one.
             */
            if (next_sector < sector_num) {
                assert(nb_sectors == 0);
                sector_num = next_sector;
                next_chunk = next_sector / sectors_per_chunk;
            }
        }

        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;

        /* When doing COW, it may happen that there is not enough space for
         * a full cluster.  Wait if that is the case.
         */
        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
            qemu_coroutine_yield();
        }
        if (s->buf_free_count < nb_chunks + added_chunks) {
            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
            break;
        }

        /* We have enough free space to copy these sectors.  */
        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);

        nb_sectors += added_sectors;
        nb_chunks += added_chunks;
        next_sector += added_sectors;
        next_chunk += added_chunks;
253 254 255 256
        if (!s->synced && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
        }
    } while (delay_ns == 0 && next_sector < end);

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_slice_new(MirrorOp);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));

        /* Advance the HBitmapIter in parallel, so that we do not examine
         * the same sector twice.
         */
        if (next_sector > hbitmap_next_sector
            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
        }

        next_sector += sectors_per_chunk;
    }

    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
    return delay_ns;
}

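/* Carve s->buf into granularity-sized chunks and thread them all onto the
 * buf_free list. */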
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

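/* Yield until every in-flight operation has completed. */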
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        qemu_coroutine_yield();
    }
}

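/* Deferred to the main loop when the job terminates.  If completion was
 * requested and the job succeeded, switch the device over to the target
 * (or to the node named by drive-mirror's "replaces" argument); then drop
 * all remaining references. */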
typedef struct {
    int ret;
} MirrorExitData;

static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = s->common.bs;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }
        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
        }
        bdrv_swap(s->target, to_replace);
        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
            /* drop the bs loop chain formed by the swap: break the loop then
             * trigger the unref from the top one */
            BlockDriverState *p = s->base->backing_hd;
            bdrv_set_backing_hd(s->base, NULL);
            bdrv_unref(p);
        }
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
}

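/* The body of the job coroutine.  For sync modes other than "none", first
 * populate the dirty bitmap from the allocation status of the source; then
 * loop, copying dirty chunks and sleeping for rate limiting, until the
 * source and target converge and the job is cancelled or completed. */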
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, sectors_per_chunk, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int n;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
    if (backing_filename[0] && !s->target->backing_hd) {
        ret = bdrv_get_info(s->target, &bdi);
        if (ret < 0) {
            goto immediate_exit;
        }
        if (s->granularity < bdi.cluster_size) {
            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
            s->cow_bitmap = bitmap_new(length);
        }
    }

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    mirror_free_init(s);

    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
        for (sector_num = 0; sector_num < end; ) {
            int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
            ret = bdrv_is_allocated_above(bs, base,
                                          sector_num, next - sector_num, &n);

            if (ret < 0) {
                goto immediate_exit;
            }

            assert(n > 0);
            if (ret == 1) {
                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
                sector_num = next;
            } else {
                sector_num += n;
            }
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                qemu_coroutine_yield();
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(s->dirty_bitmap), or we may exit
             * while the source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_iteration runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    bdrv_iostatus_disable(s->target);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

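/* Set the job's rate limit, given in bytes per second. */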
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void mirror_iostatus_reset(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

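/* block-job-complete callback: make sure the target is ready to take over
 * (its backing file is open and the job has synced), optionally look up
 * and block the node that will be replaced, then let the job coroutine
 * finish. */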
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    Error *local_err = NULL;
    int ret;

    ret = bdrv_open_backing_file(s->target, NULL, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
    }
    if (!s->synced) {
        error_setg(errp, QERR_BLOCK_JOB_NOT_READY,
                   bdrv_get_device_name(job->bs));
        return;
    }

    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = check_to_replace_node(s->replaces, &local_err);
        if (!s->to_replace) {
            error_propagate(errp, local_err);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size  = sizeof(MirrorBlockJob),
    .job_type       = BLOCK_JOB_TYPE_MIRROR,
    .set_speed      = mirror_set_speed,
    .iostatus_reset = mirror_iostatus_reset,
    .complete       = mirror_complete,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size  = sizeof(MirrorBlockJob),
    .job_type       = BLOCK_JOB_TYPE_COMMIT,
    .set_speed      = mirror_set_speed,
    .iostatus_reset = mirror_iostatus_reset,
    .complete       = mirror_complete,
};

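/* Common setup shared by drive-mirror and active commit: validate the
 * parameters, create the block job and its dirty bitmap, prepare the
 * target for writing, and spawn the mirror_run coroutine. */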
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->target = target;
    s->is_none_mode = is_none_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = MAX(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        return;
    }
    bdrv_set_enable_write_cache(s->target, true);
    bdrv_set_on_error(s->target, on_target_error, on_target_error);
    bdrv_iostatus_enable(s->target);
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
}

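/* Entry point for drive-mirror.  sync=incremental is not supported by this
 * job; sync=top mirrors only the sectors allocated above the backing file,
 * and sync=none starts with an empty dirty bitmap. */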
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
    mirror_start_job(bs, target, replaces,
                     speed, granularity, buf_size,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

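/* Active commit is implemented as a mirror job whose target is the base
 * image: reopen the base with the top's open flags so it is writable, grow
 * it if the top image is larger, and start the job with the commit job
 * driver. */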
void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    bdrv_ref(base);
    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}