/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
17
#include "block/block_int.h"
L
lirans@il.ibm.com 已提交
18
#include "hw/hw.h"
19 20
#include "qemu/queue.h"
#include "qemu/timer.h"
21 22
#include "migration/block.h"
#include "migration/migration.h"
23
#include "sysemu/blockdev.h"
L
lirans@il.ibm.com 已提交
24 25
#include <assert.h>

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

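/* Puts one block on the wire.  The stream format, mirrored by block_load()
 * on the destination, is: an 8-byte word holding
 * (sector << BDRV_SECTOR_BITS) | flags, a 1-byte device name length, the
 * device name itself, and finally BLOCK_SIZE bytes of data.  The data is
 * omitted when BLK_MIG_FLAG_ZERO_BLOCK is set.
 */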
static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, queueing zero blocks would only slow down the migration. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

/* Called with migration lock held.  */

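/* Returns nonzero if an AIO read is still in flight for the dirty chunk
 * that contains @sector.  The dirty phase uses this to drain pending
 * requests before touching such a chunk.
 */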
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

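/* Sets or clears the in-flight bit of every dirty chunk covered by
 * [sector_num, sector_num + nb_sectors).
 */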
static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

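/* Allocates the in-flight bitmap: one bit per BDRV_SECTORS_PER_DIRTY_CHUNK
 * sectors of the device, rounded up to whole bytes.
 */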
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    /* The bitmap is accessed in units of unsigned long in
     * bmds_set_aio_inflight(), so round the byte count up to a whole
     * number of longs to keep the last word access within bounds. */
    bitmap_size += sizeof(unsigned long) - 1;
    bitmap_size &= ~(int64_t)(sizeof(unsigned long) - 1);

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

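/* AIO completion callback: record the result, clear the chunk's in-flight
 * bit and queue the block on blk_list so that flush_blks() can put it on
 * the wire from the migration thread.
 */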
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

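/* Submits an asynchronous read for the next chunk of @bmds during the
 * bulk phase, skipping unallocated sectors when a shared base is in use.
 * Returns 1 when there is nothing left to submit for the device.
 */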
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

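/* Starts (with BLOCK_SIZE granularity) or stops dirty tracking on every
 * device that takes part in the migration.
 */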
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0);
    }
}

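/* bdrv_iterate() callback: set up a BlkMigDevState for each writable
 * device.  The drive reference and the in-use flag keep the device from
 * going away while the migration runs.
 */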
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        drive_get_ref(drive_get_by_blockdev(bs));
        bdrv_set_in_use(bs, 1);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

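/* Resets the global counters and enumerates all block devices.  */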
static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}

/* Called with no lock taken.  */

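/* Advances the bulk phase by one chunk on the first device that has not
 * finished it yet, and emits a progress marker whenever the overall
 * percentage changes.  Returns 0 once every device completed its bulk
 * phase.
 */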
static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

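/* Scans the dirty bitmap of @bmds from bmds->cur_dirty and transfers the
 * first dirty chunk found, draining in-flight requests that still touch
 * it.  With @is_async the read is queued as in the bulk phase; otherwise
 * it is done synchronously and sent right away.  Returns 1 when the
 * cursor has reached the end of the device, 0 if a chunk was handled,
 * negative on a read error.
 */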
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, sector)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much dirty data left to fit within max_downtime
 * 1: little enough dirty data to fit within max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

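/* Drains completed reads from blk_list onto the wire, stopping early when
 * the rate limit is reached or a read has failed.
 */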
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __func__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

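/* Sums the dirty sector counts of all devices, converted to bytes.  */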
static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

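/* Stops dirty tracking and releases every device state as well as any
 * block still sitting in the queue.
 */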
static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    set_dirty_tracking(0);

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_set_in_use(bmds->bs, 0);
        drive_put_ref(drive_get_by_blockdev(bmds->bs));
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

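/* Setup stage: enumerate the devices, start dirty tracking and push out
 * a first batch of blocks before the iterative phase begins.
 */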
static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    set_dirty_tracking(1);
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

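/* Iterative stage: keep (submitted + read_done) * BLOCK_SIZE below the
 * rate limit, finishing the bulk phase before any dirty chunks are sent.
 * Returns the number of bytes written during this call.
 */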
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    return qemu_ftell(f) - last_ftell;
}

/* Called with iothread lock taken.  */

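/* Final stage, run when the guest no longer writes to its disks: flush
 * the queue, then send every remaining dirty block synchronously.
 */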
static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save is completed and
       that all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

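/* Estimates the number of bytes still to be transferred: remaining dirty
 * data plus everything currently queued or in flight.
 */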
static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate the number of pending bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending == 0 && !block_mig_state.bulk_completed) {
        pending = BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
    return pending;
}

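/* Destination side: parse the stream produced by blk_send() until the
 * EOS marker is seen, writing each block into the matching local device.
 */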
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

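/* Handlers plugged into the generic live-migration machinery.  */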
SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}