commit.c 15.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * Live block commit
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Jeff Cody   <jcody@redhat.com>
 *  Based on stream.c by Stefan Hajnoczi
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "sysemu/block-backend.h"

enum {
    /*
     * Bounce-buffer size, in bytes, used while copying data into the base
     * image.  It is deliberately large enough to span several clusters per
     * call so that contiguous regions are populated efficiently.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024,
};

/* Slice length handed to ratelimit_set_speed(): 100 ms, in nanoseconds. */
#define SLICE_TIME (100 * 1000 * 1000ULL)

typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
39
    BlockDriverState *commit_top_bs;
K
Kevin Wolf 已提交
40 41
    BlockBackend *top;
    BlockBackend *base;
42
    BlockdevOnError on_error;
43 44
    int base_flags;
    int orig_overlay_flags;
45
    char *backing_file_str;
46 47
} CommitBlockJob;

K
Kevin Wolf 已提交
48
/*
 * Copy @nb_sectors sectors starting at @sector_num from @bs to the same
 * offset in @base, using @buf as the bounce buffer.
 *
 * @buf must be able to hold nb_sectors * BDRV_SECTOR_SIZE bytes.
 *
 * Returns 0 on success, or the negative value returned by the failing
 * blk_co_preadv() / blk_co_pwritev() call.
 */
static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
{
    int ret = 0;
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    /* Wrap the caller's buffer in a single-element I/O vector. */
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
                        qiov.size, &qiov, 0);
    if (ret < 0) {
        return ret;
    }

    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
                         qiov.size, &qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

76 77 78 79 80
/*
 * Payload handed from commit_run() to commit_complete() through
 * block_job_defer_to_main_loop().
 */
typedef struct {
    int ret;    /* result of the copy loop: 0 or a negative error value */
} CommitCompleteData;

/*
 * Completion handler scheduled by commit_run() via
 * block_job_defer_to_main_loop().
 *
 * @job:    the commit job (embedded in a CommitBlockJob)
 * @opaque: CommitCompleteData allocated by commit_run(); freed here
 *
 * On success (job not cancelled and data->ret == 0) the intermediate images
 * are dropped with bdrv_drop_intermediate().  Otherwise, if an overlay was
 * found, the commit_top filter node is taken out of the chain again as the
 * very last step.  In both cases the original open flags of base and overlay
 * are restored and the job's resources are released.
 */
static void commit_complete(BlockJob *job, void *opaque)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
    BlockDriverState *top = blk_bs(s->top);
    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs = bdrv_find_overlay(active, s->commit_top_bs);
    int ret = data->ret;
    bool remove_commit_top_bs = false;

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);

    if (!block_job_is_cancelled(&s->common) && ret == 0) {
        /* success */
        ret = bdrv_drop_intermediate(active, s->commit_top_bs, base,
                                     s->backing_file_str);
    } else if (overlay_bs) {
        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
         * after the failed/cancelled commit job is gone? If we already wrote
         * something to base, the intermediate images aren't valid any more. */
        remove_commit_top_bs = true;
    }

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_flags != bdrv_get_flags(base)) {
        bdrv_reopen(base, s->base_flags, NULL);
    }
    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
    blk_unref(s->top);
    block_job_completed(&s->common, ret);
    g_free(data);

    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
     * filter driver from the backing chain. Do this as the final step so that
     * the 'consistent read' permission can be granted.  */
    if (remove_commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top);
    }
}

/*
 * Job coroutine: copy every range that is allocated above base from s->top
 * into s->base.
 *
 * @opaque: the CommitBlockJob
 *
 * The base image is first grown to the length of top if it is shorter.  The
 * image is then walked in chunks of at most COMMIT_BUFFER_SIZE bytes; chunks
 * reported as allocated by bdrv_is_allocated_above() are copied with
 * commit_populate().  I/O errors are handled according to s->on_error.
 *
 * The final result (0 or a negative error value) is passed to
 * commit_complete() through block_job_defer_to_main_loop().
 */
static void coroutine_fn commit_run(void *opaque)
{
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
    int64_t sector_num, end;
    uint64_t delay_ns = 0;
    int ret = 0;
    int n = 0;
    void *buf = NULL;
    int64_t base_len;

    ret = s->common.len = blk_getlength(s->top);

    if (s->common.len < 0) {
        goto out;
    }

    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    /* The base must be at least as large as the data committed into it. */
    if (base_len < s->common.len) {
        ret = blk_truncate(s->base, s->common.len);
        if (ret) {
            goto out;
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (sector_num = 0; sector_num < end; sector_num += n) {
        bool copy;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        if (block_job_is_cancelled(&s->common)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
                                      sector_num,
                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                      &n);
        copy = (ret == 1);
        trace_commit_one_iteration(s, sector_num, n, ret);
        if (copy) {
            ret = commit_populate(s->top, s->base, sector_num, n, buf);
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, false, s->on_error, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                goto out;
            } else {
                /* Retry the same position on the next iteration. */
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        s->common.offset += n * BDRV_SECTOR_SIZE;

        if (copy && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, n);
        }
    }

    ret = 0;

out:
    qemu_vfree(buf);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&s->common, commit_complete, data);
}

/*
 * .set_speed callback of the commit job driver.
 *
 * A negative @speed is rejected with QERR_INVALID_PARAMETER in @errp.
 * Otherwise the job's rate limiter is reprogrammed; @speed is divided by
 * BDRV_SECTOR_SIZE because commit_run() accounts in sectors.
 */
static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

222
static const BlockJobDriver commit_job_driver = {
223
    .instance_size = sizeof(CommitBlockJob),
F
Fam Zheng 已提交
224
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
225
    .set_speed     = commit_set_speed,
J
John Snow 已提交
226
    .start         = commit_run,
227 228
};

229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
/*
 * Read callback of the commit_top driver: the request is passed unchanged
 * to the node's backing child and that call's result is returned.
 */
static int coroutine_fn
bdrv_commit_top_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                       QEMUIOVector *qiov, int flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}

/* Close callback of the commit_top driver; intentionally does nothing. */
static void bdrv_commit_top_close(BlockDriverState *bs)
{
    /* nothing to release */
}

/*
 * Permission callback of the commit_top driver.
 *
 * Whatever the parents ask for (@perm, @shared), the node requests no
 * permissions on its child and shares all of them.
 */
static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    /* Every permission may be shared with other users of the child... */
    *nshared = BLK_PERM_ALL;
    /* ...and this node itself needs none. */
    *nperm = 0;
}

/*
 * "commit_top" filter driver.
 *
 * A dummy node: its users get consistent read from it, while it neither
 * requires consistent read from its backing file nor prevents writes on the
 * backing file chain.
 */
static BlockDriver bdrv_commit_top = {
    .format_name        = "commit_top",
    .bdrv_co_preadv     = bdrv_commit_top_preadv,
    .bdrv_close         = bdrv_commit_top_close,
    .bdrv_child_perm    = bdrv_commit_top_child_perm,
};

257 258
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
259 260
                  BlockdevOnError on_error, const char *backing_file_str,
                  Error **errp)
261 262 263 264 265
{
    CommitBlockJob *s;
    BlockReopenQueue *reopen_queue = NULL;
    int orig_overlay_flags;
    int orig_base_flags;
266
    BlockDriverState *iter;
267
    BlockDriverState *overlay_bs;
268
    BlockDriverState *commit_top_bs = NULL;
269
    Error *local_err = NULL;
270
    int ret;
271

F
Fam Zheng 已提交
272
    assert(top != bs);
273 274 275 276 277 278 279 280 281 282 283 284
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        return;
    }

    overlay_bs = bdrv_find_overlay(bs, top);

    if (overlay_bs == NULL) {
        error_setg(errp, "Could not find overlay image for %s:", top->filename);
        return;
    }

285 286
    s = block_job_create(job_id, &commit_job_driver, bs, 0, BLK_PERM_ALL,
                         speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
287 288 289 290
    if (!s) {
        return;
    }

291 292 293 294
    orig_base_flags    = bdrv_get_flags(base);
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
295 296 297 298
    if (!(orig_base_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
                                         orig_base_flags | BDRV_O_RDWR);
    }
299 300 301 302
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
303
    if (reopen_queue) {
304
        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
305 306
        if (local_err != NULL) {
            error_propagate(errp, local_err);
307
            goto fail;
308 309 310
        }
    }

311 312 313 314 315 316 317 318 319 320 321 322
    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, 0, errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }

    bdrv_set_backing_hd(commit_top_bs, top);
    bdrv_set_backing_hd(overlay_bs, commit_top_bs);

    s->commit_top_bs = commit_top_bs;
    bdrv_unref(commit_top_bs);
323

324 325 326
    /* Block all nodes between top and base, because they will
     * disappear from the chain after this operation. */
    assert(bdrv_chain_contains(top, base));
327 328 329 330 331 332 333 334 335 336 337
    for (iter = top; iter != base; iter = backing_bs(iter)) {
        /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
         * at s->base (if writes are blocked for a node, they are also blocked
         * for its backing file). The other options would be a second filter
         * driver above s->base. */
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                 errp);
        if (ret < 0) {
            goto fail;
        }
338
    }
339 340 341 342 343 344

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

345
    /* overlay_bs must be blocked because it needs to be modified to
346 347 348 349 350
     * update the backing image string. */
    ret = block_job_add_bdrv(&s->common, "overlay of top", overlay_bs,
                             BLK_PERM_GRAPH_MOD, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
351 352
    }

353 354 355 356 357 358
    s->base = blk_new(BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE
                      | BLK_PERM_RESIZE,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_GRAPH_MOD
                      | BLK_PERM_WRITE_UNCHANGED);
359 360 361 362
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
K
Kevin Wolf 已提交
363

364
    /* Required permissions are already taken with block_job_add_bdrv() */
K
Kevin Wolf 已提交
365
    s->top = blk_new(0, BLK_PERM_ALL);
366
    blk_insert_bs(s->top, top, errp);
367 368 369
    if (ret < 0) {
        goto fail;
    }
K
Kevin Wolf 已提交
370

371 372 373 374 375
    s->active = bs;

    s->base_flags          = orig_base_flags;
    s->orig_overlay_flags  = orig_overlay_flags;

376 377
    s->backing_file_str = g_strdup(backing_file_str);

378 379
    s->on_error = on_error;

J
John Snow 已提交
380 381
    trace_commit_start(bs, base, top, s);
    block_job_start(&s->common);
382 383 384 385 386 387 388 389 390
    return;

fail:
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
391 392 393
    if (commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top);
    }
394
    block_job_unref(&s->common);
395
}
396 397 398 399 400 401 402


/* Chunk size, in sectors, of the copy loop and bounce buffer in bdrv_commit(). */
#define COMMIT_BUF_SECTORS 2048

/*
 * Commit COW file into the raw image: synchronously copy every allocated
 * range of @bs into its backing file.
 *
 * If the backing file is read-only it is temporarily reopened read-write.
 * A commit_top filter node is inserted between @bs and the backing file
 * for the duration of the copy.  After the copy, @bs is emptied when its
 * driver provides bdrv_make_empty().
 *
 * Returns 0 on success, otherwise a negative value:
 *   -ENOMEDIUM  @bs has no driver
 *   -ENOTSUP    @bs has no backing file
 *   -EBUSY      the commit is blocked on @bs or on its backing file
 *   -EACCES     the backing file could not be reopened read-write
 *   -ENOMEM     the bounce buffer could not be allocated
 *   or the error returned by the failing block-layer call.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockBackend *src, *backing;
    BlockDriverState *backing_file_bs = NULL;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    Error *local_err = NULL;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing->bs->read_only;
    open_flags =  bs->backing->bs->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

    ret = blk_insert_bs(src, bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    /* Insert commit_top block node above backing, so we can write to it */
    backing_file_bs = backing_bs(bs);

    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
                                         &local_err);
    if (commit_top_bs == NULL) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    bdrv_set_backing_hd(commit_top_bs, backing_file_bs);
    bdrv_set_backing_hd(bs, commit_top_bs);

    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    length = blk_getlength(src);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = blk_getlength(backing);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = blk_truncate(backing, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* blk_try_blockalign() for src will choose an alignment that works for
     * backing as well, so no need to compare the alignment manually. */
    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            /* Allocated in bs: copy these n sectors down to the backing. */
            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
                            n * BDRV_SECTOR_SIZE);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
                             n * BDRV_SECTOR_SIZE, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        blk_flush(src);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    blk_flush(backing);

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    blk_unref(backing);
    /* Take the commit_top filter out of the chain again, if it was added. */
    if (backing_file_bs) {
        bdrv_set_backing_hd(bs, backing_file_bs);
    }
    bdrv_unref(commit_top_bs);
    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}