/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/block.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include "sysemu/block-backend.h"

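/* Data moves in chunks of BLOCK_SIZE (1 MiB); with 512-byte sectors
 * (BDRV_SECTOR_BITS == 9) each chunk covers 2048 sectors.
 */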
#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
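/* The flags travel in the low bits of the be64 sector word, so they
 * must fit below BDRV_SECTOR_BITS (see blk_send() and block_load()).
 */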

#define MAX_IS_ALLOCATED_SEARCH 65536

#define MAX_INFLIGHT_IO 512

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
    Error *blocker;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

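/* On the wire each device block is: a be64 word holding the sector
 * number shifted left by BDRV_SECTOR_BITS with the BLK_MIG_FLAG_*
 * bits in the low bits, the device name as a one-byte length plus
 * that many bytes, and, unless the zero-block flag is set, BLOCK_SIZE
 * bytes of payload.
 */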
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(bdrv_get_device_name(blk->bmds->bs));
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus if we queue zero blocks we slow down the migration.  */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bdrv_nb_sectors(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

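    /* One bit per dirty chunk, rounded up to a whole number of bytes. */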
    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

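    /* When migrating on top of a shared base image, skip sectors that
     * are not allocated in the local image; the destination reads them
     * from its copy of the shared base.
     */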
    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

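    /* Round down to the start of the containing chunk; blocks are
     * always transferred chunk-aligned.
     */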
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

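/* Create one dirty bitmap per device, with one bit per BLOCK_SIZE
 * chunk, so that guest writes made while the migration runs are
 * tracked.
 */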
static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
        }
    }
    return ret;
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            return;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        error_setg(&bmds->blocker, "block device is in use by migration");
        bdrv_op_block_all(bs, bmds->blocker);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bdrv_get_device_name(bs));
        } else {
            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
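        /* Progress reuses the sector field of the wire word to carry
         * a percentage.  */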
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
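        /* If a bulk-phase AIO read of this chunk is still in flight,
         * wait for it to complete so the two copies cannot race.
         */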
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain(bmds->bs);
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

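/* Drain the list of completed reads into the migration stream,
 * stopping early when the rate limit is hit or a read has failed.
 */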
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

static void block_migration_cleanup(void *opaque)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

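    /* Make sure no migration AIO is still in flight before tearing
     * down the dirty bitmaps and per-device state.
     */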
    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
        error_free(bmds->blocker);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    if (ret) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* Control the rate of transfer: keep going while the data queued
     * or in flight stays below the bandwidth limit and the number of
     * in-flight requests stays below MAX_INFLIGHT_IO.
     */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           (block_mig_state.submitted +
            block_mig_state.read_done) <
           MAX_INFLIGHT_IO) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk save has completed and
       all async reads have completed.  */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return 0;
}

static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                               uint64_t *non_postcopiable_pending,
                               uint64_t *postcopiable_pending)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    /* We don't do postcopy */
    *non_postcopiable_pending += pending;
}

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    BlockBackend *blk;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

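    /* Consume the stream written by blk_send() until an EOS flag is
     * seen: each be64 word carries a sector address in the high bits
     * and BLK_MIG_FLAG_* bits in the low bits.
     */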
    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }
            bs = blk_bs(blk);
            if (!bs) {
                fprintf(stderr, "Block device %s has no medium\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_nb_sectors(bs);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}