/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

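/* Translate an I/O error into the action configured by the job's
 * on_source_error or on_target_error policy.  Any error means the target
 * can no longer be assumed to be in sync with the source. */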
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

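/* Common completion path for copy, zero and discard operations: return the
 * op's buffers to the free list, clear its chunks from the in-flight
 * bitmap, account for progress, and re-enter the job coroutine if it is
 * waiting for an operation to finish. */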
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

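/* Write completion callback: on failure, mark the sectors dirty again so
 * that they are retried, then apply the target error policy. */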
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

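/* Read completion callback: on success, immediately submit the matching
 * write to the target. */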
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                    0, mirror_write_complete, op);
}

static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against the original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of the source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

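/* Yield until mirror_iteration_done() re-enters the coroutine. */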
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

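/* Submit an asynchronous zero-write or discard for a range that is known
 * to contain no data, avoiding a pointless read from the source. */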
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is a nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_discard(s->target, sector_num, op->nb_sectors,
                        mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

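/* Copy one batch of dirty data: take the next dirty chunk from the bitmap,
 * extend it over consecutive dirty chunks up to buf_size, then copy, zero
 * or discard each part of the range according to its block status.
 * Returns the delay in ns requested by the rate limiter, or 0. */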
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in-flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
        }
        assert(hbitmap_next == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            break;
        case MIRROR_METHOD_ZERO:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
            break;
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors);
        }
    }
    return delay_ns;
}

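/* Carve the contiguous bounce buffer s->buf into granularity-sized chunks
 * and put them all on the free list. */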
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

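/* Wait until all in-flight mirror operations have completed. */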
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

typedef struct {
    int ret;
} MirrorExitData;

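/* Runs in the main loop once mirror_run() has finished: performs the graph
 * switch to the target if the job completed successfully, then releases
 * all references and blockers taken by the job. */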
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain other potential users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);

        /* We just changed the BDS the job BB refers to */
        blk_remove_bs(job->blk);
        blk_insert_bs(job->blk, src);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
        aio_enable_external(iohandler_get_aio_context());
    }
    bdrv_unref(src);
}

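/* Give up the CPU at least once per SLICE_TIME so that cancellation and
 * pause requests are honoured even in long-running loops. */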
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

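/* The body of the mirror coroutine: populate the dirty bitmap according to
 * the sync mode, loop copying dirty data until the job is cancelled or
 * completed, then hand the final cleanup over to the main loop. */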
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t sector_num, end, length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int n;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);

        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding int limit. */
            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                                 end - sector_num);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                goto immediate_exit;
            }

            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);

            if (ret < 0) {
                goto immediate_exit;
            }

            assert(n > 0);
            if (ret == 1 || mark_all_dirty) {
                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
            }
            sector_num += n;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while the
             * mirror loop runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_co_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_begin isn't enough to quiesce it. This is ugly, we
         * need a block layer API change to achieve this. */
        aio_disable_external(iohandler_get_aio_context());
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

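/* Handler for block-job-set-speed; speed is given in bytes per second. */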
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

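/* Handler for block-job-complete: prepare the target's backing chain as
 * requested by backing_mode and tell mirror_run() to switch over. */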
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *src, *target;

    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target) != backing) {
            bdrv_set_backing_hd(target, backing);
        }
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

/* There is no matching mirror_resume() because mirror_run() will begin
 * iterating again when the job is resumed.
 */
static void coroutine_fn mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_drain(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

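/* Common setup for drive-mirror and active commit: validate parameters,
 * create the job together with its target BlockBackend and dirty bitmap,
 * and start the mirror_run() coroutine. */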
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockDriverState *target, const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->target = blk_new();
    blk_insert_bs(s->target, target);

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run, s);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co);
}

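/* A sketch of a typical invocation from a QMP command handler (block_job_cb
 * and local_err are illustrative names, not defined in this file); passing
 * 0 for granularity and buf_size selects the defaults:
 *
 *     mirror_start("job0", bs, target_bs, NULL, speed, 0, 0,
 *                  MIRROR_SYNC_MODE_FULL, MIRROR_OPEN_BACKING_CHAIN,
 *                  BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
 *                  true, block_job_cb, bs, &local_err);
 */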
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

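/* Active commit is implemented as a mirror job whose target is the base
 * image: the base is temporarily reopened using the top's open flags, and
 * grown first if the top image is larger than it. */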
void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                            "Top image %s is larger than base image %s, and "
979 980 981 982 983 984
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
F