backup.c 11.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/ratelimit.h"

/* Backup granularity: data is copied and tracked in clusters of
 * (1 << BACKUP_CLUSTER_BITS) bytes, i.e. 64 KiB. */
#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
/* Number of BDRV_SECTOR_SIZE-byte sectors that make up one backup cluster */
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

/* Slice length passed to ratelimit_set_speed() in backup_set_speed() */
#define SLICE_TIME 100000000ULL /* ns */

/*
 * One in-flight copy-on-write operation.  It covers the half-open range of
 * backup clusters [start, end) and is linked into
 * BackupBlockJob.inflight_reqs between cow_request_begin() and
 * cow_request_end().
 */
typedef struct CowRequest {
    int64_t start;                  /* first cluster covered by the request */
    int64_t end;                    /* one past the last cluster covered */
    QLIST_ENTRY(CowRequest) list;   /* link in BackupBlockJob.inflight_reqs */
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
40
    MirrorSyncMode sync_mode;
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Register @req with @job as an in-flight copy of clusters [start, end).
 * The caller owns the storage for @req and must call cow_request_end()
 * before that storage goes away. */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    qemu_co_queue_init(&req->wait_queue);
    req->end = end;
    req->start = start;

    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request: unlink it from the job's in-flight list
 * and restart every coroutine that was queued on it in
 * wait_for_overlapping_requests(). */
static void cow_request_end(CowRequest *req)
{
    /* Unlink first so that restarted waiters no longer find this request */
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/*
 * Copy every not-yet-copied backup cluster touched by the sector range
 * [sector_num, sector_num + nb_sectors) from @bs to the job's target.
 *
 * Clusters whose bit is already set in job->bitmap are skipped.  Each copied
 * cluster is read into a bounce buffer and written to the target (as zeroes
 * when the buffer is all zero), after which its bit is set and the job's
 * progress counters are advanced.
 *
 * @error_is_read: optional; on failure it is set to true when the read from
 *                 @bs failed and to false when the write to the target
 *                 failed.  May be NULL.
 *
 * Returns 0 on success or the negative value returned by the failing
 * read/write call.
 */
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    QEMUIOVector bounce_qiov;
    struct iovec iov;
    void *bounce_buffer = NULL;
    int64_t start, end, cluster;
    int ret = 0;

    /* Shared lock: backup_run() takes it exclusively to drain pending CoW */
    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    /* Widen the sector range to whole backup clusters */
    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    /* Serialize against other copies touching the same clusters */
    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (cluster = start; cluster < end; cluster++) {
        int64_t first_sector;
        int count;

        if (hbitmap_get(job->bitmap, cluster)) {
            trace_backup_do_cow_skip(job, cluster);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, cluster);

        first_sector = cluster * BACKUP_SECTORS_PER_CLUSTER;

        /* The last cluster of the device may be shorter than a full one */
        count = MIN(BACKUP_SECTORS_PER_CLUSTER,
                    job->common.len / BDRV_SECTOR_SIZE - first_sector);

        /* Allocate the bounce buffer lazily, only if something is copied */
        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = count * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = bdrv_co_readv(bs, first_sector, count, &bounce_qiov);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, cluster, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(bounce_buffer, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target, first_sector, count);
        } else {
            ret = bdrv_co_writev(job->target, first_sector, count,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, cluster, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, cluster, 1);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += count;
        job->common.offset += count * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    /* Wake up anybody waiting on the clusters this request covered */
    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

/*
 * Before-write notifier installed on the source device by backup_run().
 * @opaque is the tracked request about to be written; the clusters it touches
 * are copied to the target first via backup_do_cow().  Returns the result of
 * backup_do_cow().
 */
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *tracked = opaque;
    BlockDriverState *source = tracked->bs;

    /* No error_is_read out-parameter: the caller only sees the return code */
    return backup_do_cow(source, tracked->sector_num, tracked->nb_sectors,
                         NULL);
}

/*
 * BlockJobType.set_speed callback.  Rejects a negative @speed through @errp;
 * otherwise programs the job's rate limiter with @speed divided by
 * BDRV_SECTOR_SIZE (i.e. a per-sector quota) and a slice of SLICE_TIME.
 */
static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *backup_job;

    if (speed < 0) {
        error_set(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }

    backup_job = container_of(job, BackupBlockJob, common);
    ratelimit_set_speed(&backup_job->limit, speed / BDRV_SECTOR_SIZE,
                        SLICE_TIME);
}

/* BlockJobType.iostatus_reset callback: reset the I/O status of the backup
 * target that belongs to @job. */
static void backup_iostatus_reset(BlockJob *job)
{
    BlockDriverState *target =
        container_of(job, BackupBlockJob, common)->target;

    bdrv_iostatus_reset(target);
}

/* Job type descriptor handed to block_job_create() by backup_start() */
static const BlockJobType backup_job_type = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = "backup",
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

/*
 * Ask the block job layer how to react to an I/O error.
 *
 * @read:  true if the failure was a read from the source device, false if it
 *         was a write to the target.  Selects both the device the error is
 *         attributed to and the error policy that applies.
 * @error: the error code passed on to block_job_error_action().
 */
static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    BlockDriverState *failed_bs = read ? job->common.bs : job->target;
    BlockdevOnError policy = read ? job->on_source_error
                                  : job->on_target_error;

    return block_job_error_action(&job->common, failed_bs, policy, read,
                                  error);
}

/*
 * Coroutine body of the backup job; @opaque is the BackupBlockJob.
 *
 * Sets up the per-cluster "already copied" bitmap and installs a
 * before-write notifier on the source so that guest writes trigger a
 * copy of the old data first (backup_before_write_notify()).
 *
 *  - MIRROR_SYNC_MODE_NONE: nothing is copied proactively; the coroutine
 *    just yields until the job is cancelled.
 *  - otherwise (FULL and TOP): every cluster is walked in order and copied
 *    with backup_do_cow().  In TOP mode, clusters for which
 *    bdrv_co_is_allocated() reports nothing allocated are skipped.
 *
 * On exit the notifier is removed, pending backup_do_cow() calls are
 * drained, the bitmap is freed, the target is deleted and the job is
 * completed with the last error code (0 on success or cancellation).
 */
static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    /* [start, end) is the range of backup clusters covering the device */
    start = 0;
    end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
                       BACKUP_SECTORS_PER_CLUSTER);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            /* we need to yield so that qemu_aio_flush() returns.
             * (without, VM does not reboot)
             */
            if (job->common.speed) {
                uint64_t delay_ns = ratelimit_calculate_delay(
                        &job->limit, job->sectors_read);
                job->sectors_read = 0;
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
            } else {
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
            }

            /* The job may have been cancelled while we were sleeping */
            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_co_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    n = 0; /* defined value even if the call leaves it unset */
                    alloced =
                        bdrv_co_is_allocated(bs,
                                start * BACKUP_SECTORS_PER_CLUSTER + i,
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    /* Stop probing when:
                     *  - allocated data was found (copy the cluster);
                     *  - the query failed (fall through and let the copy
                     *    below surface the error through the normal
                     *    error policy);
                     *  - no progress was made (n == 0), which would
                     *    otherwise spin here forever. */
                    if (alloced == 1 || alloced < 0 || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BDRV_ACTION_REPORT) {
                    break;
                } else {
                    /* Undo the loop increment so this cluster is retried */
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_delete(target);

    block_job_completed(&job->common, ret);
}

/*
 * Create and start a backup job that copies @bs to @target.
 *
 * @bs:              source device; must not be NULL.
 * @target:          device the backup is written to; must not be NULL.
 *                   Once the job has been started, backup_run() deletes it
 *                   when the job finishes.  It is not touched on the error
 *                   paths of this function.
 * @speed:           initial speed limit, forwarded to block_job_create().
 * @sync_mode:       what to copy; see backup_run().
 * @on_source_error: policy applied when reading from @bs fails.  STOP and
 *                   ENOSPC are only accepted if I/O status is enabled
 *                   on @bs.
 * @on_target_error: policy applied when writing to @target fails.
 * @cb, @opaque:     completion callback and its argument, forwarded to
 *                   block_job_create(); @cb must not be NULL.
 * @errp:            set on failure, in which case no job is created.
 *
 * The job coroutine is entered before this function returns.
 */
void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockDriverCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    BackupBlockJob *job;
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        return;
    }

    job = block_job_create(&backup_job_type, bs, speed, cb, opaque, errp);
    if (!job) {
        return;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
}