block-migration.c 19.8 KB
Newer Older
L
lirans@il.ibm.com 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
17
#include "qemu-queue.h"
18
#include "qemu-timer.h"
19
#include "monitor.h"
L
lirans@il.ibm.com 已提交
20
#include "block-migration.h"
21
#include "migration.h"
22
#include "blockdev.h"
L
lirans@il.ibm.com 已提交
23 24
#include <assert.h>

J
Jan Kiszka 已提交
25
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
L
lirans@il.ibm.com 已提交
26 27 28

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
29
#define BLK_MIG_FLAG_PROGRESS           0x04
L
lirans@il.ibm.com 已提交
30 31 32 33 34 35

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
M
malc 已提交
36
#define DPRINTF(fmt, ...) \
L
lirans@il.ibm.com 已提交
37 38
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
M
malc 已提交
39
#define DPRINTF(fmt, ...) \
L
lirans@il.ibm.com 已提交
40 41 42
    do { } while (0)
#endif

43 44 45 46 47
/* Per-device migration state, one for each block device being migrated. */
typedef struct BlkMigDevState {
    BlockDriverState *bs;        /* the device being migrated */
    int bulk_completed;          /* bulk (first) pass finished for this device */
    int shared_base;             /* skip sectors not allocated in the local image */
    int64_t cur_sector;          /* bulk-pass cursor (next sector to read) */
    int64_t cur_dirty;           /* dirty-pass cursor */
    int64_t completed_sectors;   /* bulk-pass progress, for percentage reporting */
    int64_t total_sectors;       /* device length in sectors */
    int64_t dirty;               /* NOTE(review): not referenced in this file */
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    unsigned long *aio_bitmap;   /* one bit per chunk with an AIO read in flight */
} BlkMigDevState;

L
lirans@il.ibm.com 已提交
56 57 58 59
/* One in-flight chunk read: the data buffer plus enough context to send it
 * to the migration stream once the read completes. */
typedef struct BlkMigBlock {
    uint8_t *buf;                /* BLOCK_SIZE payload buffer */
    BlkMigDevState *bmds;        /* owning device */
    int64_t sector;              /* first sector of this chunk */
    int nr_sectors;              /* sectors actually read (may be short at EOF) */
    struct iovec iov;            /* single-element iovec over buf */
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;                     /* completion status from the AIO read */
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

/* Global block-migration state (only one migration runs at a time). */
typedef struct BlkMigState {
    int blk_enable;              /* storage migration requested */
    int shared_base;             /* incremental migration on top of a shared base */
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;  /* devices to migrate */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;       /* completed reads awaiting send */
    int submitted;               /* AIO reads currently in flight */
    int read_done;               /* reads completed but not yet sent */
    int transferred;             /* blocks written to the migration stream */
    int64_t total_sector_sum;    /* sum of all devices' sectors, for progress */
    int prev_progress;           /* last percentage reported (-1 = none yet) */
    int bulk_completed;          /* bulk pass done on every device */
    long double total_time;      /* accumulated read latency (see blk_mig_read_cb) */
    long double prev_time_offset; /* timestamp of the last read completion/submit */
    int reads;                   /* number of completed reads */
} BlkMigState;

static BlkMigState block_mig_state;
L
lirans@il.ibm.com 已提交
85

86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
/* Emit one block to the migration stream: tagged sector header, the
 * length-prefixed device name, then a full BLOCK_SIZE payload. */
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int name_len;

    /* sector number tagged with the device-block flag */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* length-prefixed device name */
    name_len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, name_len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, name_len);

    /* chunk payload */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

/* Total bytes already sent in the bulk pass, summed over all devices. */
uint64_t blk_mig_bytes_transferred(void)
{
    uint64_t sectors_done = 0;
    BlkMigDevState *dev;

    QSIMPLEQ_FOREACH(dev, &block_mig_state.bmds_list, entry) {
        sectors_done += dev->completed_sectors;
    }
    return sectors_done << BDRV_SECTOR_BITS;
}

/* Bytes still to be transferred in the bulk pass. */
uint64_t blk_mig_bytes_remaining(void)
{
    uint64_t total = blk_mig_bytes_total();
    uint64_t done = blk_mig_bytes_transferred();

    return total - done;
}

/* Total bytes of all devices queued for migration. */
uint64_t blk_mig_bytes_total(void)
{
    uint64_t total_sectors = 0;
    BlkMigDevState *dev;

    QSIMPLEQ_FOREACH(dev, &block_mig_state.bmds_list, entry) {
        total_sectors += dev->total_sectors;
    }
    return total_sectors << BDRV_SECTOR_BITS;
}

134 135 136
static inline long double compute_read_bwidth(void)
{
    assert(block_mig_state.total_time != 0);
137
    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
138 139
}

140 141 142 143
/* Is an AIO read currently in flight for the chunk containing @sector?
 * Sectors past the end of the device are never considered in flight. */
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
    unsigned long word, mask;

    if ((sector << BDRV_SECTOR_BITS) >= bdrv_getlength(bmds->bs)) {
        return 0;
    }
    word = bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)];
    mask = 1UL << (chunk % (sizeof(unsigned long) * 8));
    return (word & mask) != 0;
}

/* Set (or clear, when @set == 0) the in-flight bit for every chunk touched
 * by the range [sector_num, sector_num + nb_sectors). */
static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t chunk, last_chunk;
    unsigned long idx, bit;

    chunk = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    last_chunk = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    while (chunk <= last_chunk) {
        idx = chunk / (sizeof(unsigned long) * 8);
        bit = chunk % (sizeof(unsigned long) * 8);
        if (set) {
            bmds->aio_bitmap[idx] |= 1UL << bit;
        } else {
            bmds->aio_bitmap[idx] &= ~(1UL << bit);
        }
        chunk++;
    }
}

/* Allocate (zeroed) the in-flight-AIO bitmap for @bmds: one bit per
 * BDRV_SECTORS_PER_DIRTY_CHUNK-sized chunk of the device.
 *
 * The bitmap is read and written one whole unsigned long at a time in
 * bmds_aio_inflight()/bmds_set_aio_inflight(), so the allocation must be a
 * multiple of sizeof(unsigned long).  The previous byte-exact size let the
 * access of the final word run up to sizeof(unsigned long) - 1 bytes past
 * the end of the buffer (heap over-read/over-write). */
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    /* chunks = ceil(total_sectors / chunk_sectors); one bit per chunk,
       so ceil(chunks / 8) bytes */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
    /* round up to word granularity for the unsigned long accesses */
    bitmap_size = (bitmap_size + (int64_t)sizeof(unsigned long) - 1) &
            ~((int64_t)sizeof(unsigned long) - 1);

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

L
lirans@il.ibm.com 已提交
186 187
/* AIO completion callback for a chunk read submitted by the bulk or dirty
 * pass: record timing for the bandwidth estimate, queue the block for
 * sending, and clear its in-flight bit. */
static void blk_mig_read_cb(void *opaque, int ret)
{
    long double curr_time = qemu_get_clock_ns(rt_clock);
    BlkMigBlock *blk = opaque;

    /* stash the read status; flush_blks() propagates any error to the file */
    blk->ret = ret;

    /* accumulate per-read latency used by compute_read_bwidth() */
    block_mig_state.reads++;
    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
    block_mig_state.prev_time_offset = curr_time;

    /* hand the filled buffer to the send queue, then clear the in-flight
       bit so the dirty pass no longer waits on this chunk */
    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

205
/* Submit one asynchronous read of the next bulk chunk of @bmds.
 * Returns 1 when the device's bulk pass is finished, 0 otherwise.
 * On submission failure, reports the error on @mon, flags it on @f,
 * and returns 0. */
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    /* with a shared base image, skip runs of unallocated sectors */
    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        /* bulk pass done for this device */
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    /* align down to a chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* the last chunk of the device may be short */
    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    /* first read in flight: (re)base the latency accounting */
    if (block_mig_state.submitted == 0) {
        block_mig_state.prev_time_offset = qemu_get_clock_ns(rt_clock);
    }

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        goto error;
    }
    block_mig_state.submitted++;

    /* the chunk is being transferred now; forget any earlier dirtying */
    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f, -EIO);
    g_free(blk->buf);
    g_free(blk);
    return 0;
}

static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;
275 276

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
277
        bdrv_set_dirty_tracking(bmds->bs, enable);
L
lirans@il.ibm.com 已提交
278 279 280
    }
}

281
/* bdrv_iterate() callback: create migration state for one block device.
 * Read-only and zero-length devices are skipped.  @opaque is the Monitor
 * used for progress messages. */
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    Monitor *mon = opaque;
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        /* pin the drive and mark it busy for the duration of the migration */
        drive_get_ref(drive_get_by_blockdev(bs));
        bdrv_set_in_use(bs, 1);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            monitor_printf(mon, "Start migration for %s with shared base "
                                "image\n",
                           bs->device_name);
        } else {
            monitor_printf(mon, "Start full migration for %s\n",
                           bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

/* Reset the global counters and enumerate all writable block devices
 * (via init_blk_migration_it) at the start of a migration. */
static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.total_time = 0;
    block_mig_state.reads = 0;

    bdrv_iterate(init_blk_migration_it, mon);
}

L
Liran Schour 已提交
332
/* Advance the bulk pass by one chunk on the first device that is not yet
 * done, and emit a progress record whenever the percentage changes.
 * Returns 1 while any device still has bulk work left, 0 once all devices
 * have completed their bulk pass. */
static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    /* only emit/print when the percentage actually moved */
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        monitor_printf(mon, "Completed %d %%\r", progress);
        monitor_flush(mon);
    }

    return ret;
}

370
static void blk_mig_reset_dirty_cursor(void)
L
lirans@il.ibm.com 已提交
371 372
{
    BlkMigDevState *bmds;
373 374 375 376 377 378 379 380 381 382 383

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Scan @bmds from cur_dirty for the next dirty chunk and transfer it:
 * asynchronously via bdrv_aio_readv() when @is_async, else synchronously
 * via bdrv_read() + blk_send().  Returns 1 when the cursor has reached the
 * end of the device, 0 if a chunk was processed or an error occurred (the
 * error is reported on @mon and flagged on @f). */
static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                                 BlkMigDevState *bmds, int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        /* never race with our own outstanding read of the same chunk */
        if (bmds_aio_inflight(bmds, sector)) {
            qemu_aio_flush();
        }
        if (bdrv_get_dirty(bmds->bs, sector)) {

            /* the last chunk of the device may be short */
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                /* first read in flight: (re)base the latency accounting */
                if (block_mig_state.submitted == 0) {
                    block_mig_state.prev_time_offset = qemu_get_clock_ns(rt_clock);
                }

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);
                if (!blk->aiocb) {
                    goto error;
                }
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
            } else {
                /* synchronous path: read, send, and free immediately */
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            /* one chunk per call; the chunk just handled is now clean, so
               the next call re-tests it and advances past it */
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
    qemu_file_set_error(f, ret);
    g_free(blk->buf);
    g_free(blk);
    return 0;
}

/* Process one dirty chunk on the first device that still has dirty work.
 * Returns 1 if some device made progress, 0 when every device's dirty
 * cursor has reached the end. */
static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
{
    BlkMigDevState *dev;
    int progress_made = 0;

    QSIMPLEQ_FOREACH(dev, &block_mig_state.bmds_list, entry) {
        if (mig_save_device_dirty(mon, f, dev, is_async) == 0) {
            progress_made = 1;
            break;
        }
    }

    return progress_made;
}

/* Drain completed reads from blk_list into the migration stream, stopping
 * at the rate limit or at the first failed read (whose status is
 * propagated to @f). */
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            /* surface the AIO read failure on the migration file */
            qemu_file_set_error(f, blk->ret);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

496 497 498 499 500 501 502 503 504 505 506 507
/* Total bytes still marked dirty across every queued device. */
static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *dev;
    int64_t dirty_chunks = 0;

    QSIMPLEQ_FOREACH(dev, &block_mig_state.bmds_list, entry) {
        dirty_chunks += bdrv_get_dirty_count(dev->bs);
    }

    return dirty_chunks * BLOCK_SIZE;
}

L
lirans@il.ibm.com 已提交
508 509
static int is_stage2_completed(void)
{
510 511 512 513 514 515
    int64_t remaining_dirty;
    long double bwidth;

    if (block_mig_state.bulk_completed == 1) {

        remaining_dirty = get_remaining_dirty();
516 517 518
        if (remaining_dirty == 0) {
            return 1;
        }
519

520
        bwidth = compute_read_bwidth();
521

522
        if ((remaining_dirty / bwidth) <=
523 524 525 526 527 528 529 530 531
            migrate_max_downtime()) {
            /* finish stage2 because we think that we can finish remaing work
               below max_downtime */

            return 1;
        }
    }

    return 0;
L
lirans@il.ibm.com 已提交
532 533
}

534
/* Tear down all migration state: stop dirty tracking, release device
 * references and bitmaps, and free any blocks still queued for sending. */
static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    set_dirty_tracking(0);

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        /* undo the in-use mark and drive ref taken in init_blk_migration_it */
        bdrv_set_in_use(bmds->bs, 0);
        drive_put_ref(drive_get_by_blockdev(bmds->bs));
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }

    monitor_printf(mon, "\n");
}

558
/* Live-savevm handler for block migration.
 * stage < 0: migration cancelled, clean up.
 * stage 1:   set up per-device state and start dirty tracking.
 * stage 2:   iterative phase - bulk pass first, then dirty chunks, rate
 *            limited; returns 1 when stage 2 may end (see
 *            is_stage2_completed()).
 * stage 3:   final synchronous flush of remaining dirty chunks.
 * Returns 0 on error/cancel, otherwise the "phase complete" indication
 * expected by the savevm core. */
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    DPRINTF("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_get_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    blk_mig_reset_dirty_cursor();

    if (stage == 2) {
        /* control the rate of transfer */
        while ((block_mig_state.submitted +
                block_mig_state.read_done) * BLOCK_SIZE <
               qemu_file_get_rate_limit(f)) {
            if (block_mig_state.bulk_completed == 0) {
                /* first finish the bulk phase */
                if (blk_mig_save_bulked_block(mon, f) == 0) {
                    /* finished saving bulk on all devices */
                    block_mig_state.bulk_completed = 1;
                }
            } else {
                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                    /* no more dirty blocks */
                    break;
                }
            }
        }

        flush_blks(f);

        if (qemu_file_get_error(f)) {
            blk_mig_cleanup(mon);
            return 0;
        }
    }

    if (stage == 3) {
        /* we know for sure that save bulk is completed and
           all async read completed */
        assert(block_mig_state.submitted == 0);

        /* synchronously drain every remaining dirty chunk */
        while (blk_mig_save_dirty_block(mon, f, 0) != 0);
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_get_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

/* Incoming side: replay the stream written by block_save_live(), writing
 * each received chunk to the named device, until the EOS flag is seen.
 * Returns 0 on success or a negative errno on a malformed stream or
 * write failure. */
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;

    do {
        addr = qemu_get_be64(f);

        /* low bits carry the record-type flags, the rest is the sector */
        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            int ret;
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            /* cache the device length while successive chunks target the
               same device */
            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            /* the last chunk of a device may be short */
            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            buf = g_malloc(BLOCK_SIZE);

            /* the sender always transmits a full BLOCK_SIZE payload */
            qemu_get_buffer(f, buf, BLOCK_SIZE);
            ret = bdrv_write(bs, addr, buf, nr_sectors);

            g_free(buf);
            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_get_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

/* savevm hook: record whether storage should be migrated and whether an
 * incremental (shared-base) migration was requested. */
static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.shared_base = shared_base;
    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable = blk_enable | shared_base;
}

/* Hook block migration into the live-savevm machinery and initialize the
 * global queues. */
void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live(NULL, "block", 0, 1, block_set_params,
                         block_save_live, NULL, block_load, &block_mig_state);
}