ram.c 79.5 KB
Newer Older
1 2 3 4
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
5 6 7 8
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
P
Peter Maydell 已提交
28
#include "qemu/osdep.h"
29 30
#include "qemu-common.h"
#include "cpu.h"
31
#include <zlib.h>
32
#include "qapi-event.h"
33
#include "qemu/cutils.h"
34 35
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
J
Juan Quintela 已提交
36 37
#include "qemu/timer.h"
#include "qemu/main-loop.h"
38
#include "migration/migration.h"
39
#include "postcopy-ram.h"
40 41 42 43 44 45
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
46
#include "migration/colo.h"
47 48 49 50 51 52 53 54 55 56 57 58 59 60

/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

61
static uint8_t *ZERO_TARGET_PAGE;
62 63 64

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
65
    return buffer_is_zero(p, size);
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

95 96 97 98 99 100 101 102 103 104 105
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns the new_size or negative in case of error.
 *
 * @new_size: new cache size
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                                        TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

141 142 143 144 145 146 147 148 149 150 151 152
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

J
Juan Quintela 已提交
153 154
/* State of RAM for migration */
struct RAMState {
J
Juan Quintela 已提交
155 156
    /* QEMUFile used for this migration */
    QEMUFile *f;
J
Juan Quintela 已提交
157 158 159 160
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
161 162
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
J
Juan Quintela 已提交
163 164 165 166
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
167 168
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
169 170
    /* How many times we have synchronized the bitmap */
    uint64_t bitmap_sync_count;
171 172 173
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
174
    /* bytes transferred at start_time */
175
    uint64_t bytes_xfer_prev;
176
    /* number of dirty pages since start_time */
177
    uint64_t num_dirty_pages_period;
178 179
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
180 181
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
182 183 184
    /* Accounting fields */
    /* number of zero pages.  It used to be pages filled by the same char. */
    uint64_t zero_pages;
J
Juan Quintela 已提交
185 186
    /* number of normal transferred pages */
    uint64_t norm_pages;
187 188
    /* Iterations since start */
    uint64_t iterations;
189 190
    /* xbzrle transmitted bytes.  Notice that this is with
     * compression, they can't be calculated from the pages */
191
    uint64_t xbzrle_bytes;
192 193
    /* xbzrle transmmited pages */
    uint64_t xbzrle_pages;
194 195
    /* xbzrle number of cache miss */
    uint64_t xbzrle_cache_miss;
196 197
    /* xbzrle miss rate */
    double xbzrle_cache_miss_rate;
198 199
    /* xbzrle number of overflows */
    uint64_t xbzrle_overflows;
200 201
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
202 203
    /* total number of bytes transferred */
    uint64_t bytes_transferred;
204 205
    /* number of dirtied pages in the last second */
    uint64_t dirty_pages_rate;
206 207
    /* Count of requests incoming from destination */
    uint64_t postcopy_requests;
208 209
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
210 211
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
212 213 214
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
J
Juan Quintela 已提交
215 216 217 218 219
};
typedef struct RAMState RAMState;

static RAMState ram_state;

220 221
uint64_t dup_mig_pages_transferred(void)
{
222
    return ram_state.zero_pages;
223 224 225 226
}

uint64_t norm_mig_pages_transferred(void)
{
J
Juan Quintela 已提交
227
    return ram_state.norm_pages;
228 229 230 231
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
232
    return ram_state.xbzrle_bytes;
233 234 235 236
}

uint64_t xbzrle_mig_pages_transferred(void)
{
237
    return ram_state.xbzrle_pages;
238 239 240 241
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
242
    return ram_state.xbzrle_cache_miss;
243 244 245 246
}

double xbzrle_mig_cache_miss_rate(void)
{
247
    return ram_state.xbzrle_cache_miss_rate;
248 249 250 251
}

uint64_t xbzrle_mig_pages_overflow(void)
{
252
    return ram_state.xbzrle_overflows;
253 254
}

J
Juan Quintela 已提交
255
uint64_t ram_bytes_transferred(void)
256
{
J
Juan Quintela 已提交
257
    return ram_state.bytes_transferred;
258 259
}

J
Juan Quintela 已提交
260
uint64_t ram_bytes_remaining(void)
261
{
J
Juan Quintela 已提交
262
    return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
263 264
}

265 266 267 268 269
uint64_t ram_dirty_sync_count(void)
{
    return ram_state.bitmap_sync_count;
}

270 271 272 273 274
uint64_t ram_dirty_pages_rate(void)
{
    return ram_state.dirty_pages_rate;
}

275 276 277 278 279
uint64_t ram_postcopy_requests(void)
{
    return ram_state.postcopy_requests;
}

280 281 282 283
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
284 285
    /* Current page to search from */
    unsigned long page;
286 287 288 289 290
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

291 292
struct CompressParam {
    bool done;
293
    bool quit;
294 295 296 297 298 299 300 301 302
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
303
    bool done;
304
    bool quit;
305 306 307
    QemuMutex mutex;
    QemuCond cond;
    void *des;
308
    uint8_t *compbuf;
309 310 311 312 313 314 315 316 317 318
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
L
Liang Li 已提交
319 320
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
321 322 323 324 325
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
326 327
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
328

329 330
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
331 332 333 334

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
335 336
    RAMBlock *block;
    ram_addr_t offset;
337

338
    qemu_mutex_lock(&param->mutex);
339
    while (!param->quit) {
340 341 342 343 344 345 346 347
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

L
Liang Li 已提交
348
            qemu_mutex_lock(&comp_done_lock);
349
            param->done = true;
L
Liang Li 已提交
350 351
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);
352 353 354

            qemu_mutex_lock(&param->mutex);
        } else {
355 356 357
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
358
    qemu_mutex_unlock(&param->mutex);
359 360 361 362 363 364 365 366 367

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
368

369 370
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
371
        comp_param[idx].quit = true;
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
L
Liang Li 已提交
392 393
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
L
Liang Li 已提交
410 411
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
412
    for (i = 0; i < thread_count; i++) {
C
Cao jin 已提交
413 414
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
415 416 417
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
418
        comp_param[i].quit = false;
419 420 421 422 423 424 425 426 427
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
428
 * save_page_header: write page header to wire
429 430 431
 *
 * If this is the 1st block, it also writes the block identification
 *
432
 * Returns the number of bytes written
433 434 435 436 437 438
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
J
Juan Quintela 已提交
439
static size_t save_page_header(RAMState *rs, RAMBlock *block, ram_addr_t offset)
440
{
441
    size_t size, len;
442

J
Juan Quintela 已提交
443 444 445 446
    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(rs->f, offset);
447 448 449
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
450
        len = strlen(block->idstr);
J
Juan Quintela 已提交
451 452
        qemu_put_byte(rs->f, len);
        qemu_put_buffer(rs->f, (uint8_t *)block->idstr, len);
453
        size += 1 + len;
J
Juan Quintela 已提交
454
        rs->last_sent_block = block;
455 456 457 458
    }
    return size;
}

459 460 461 462 463 464 465 466
/**
 * mig_throttle_guest_down: throotle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
467 468 469 470
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
471 472
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
473 474 475 476 477 478 479 480 481 482

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}

483 484 485
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
J
Juan Quintela 已提交
486
 * @rs: current RAM state
487 488 489
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
490 491 492
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
493
 * when a small write is made into the 0'd page it gets XBZRLE sent.
494
 */
J
Juan Quintela 已提交
495
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
496
{
J
Juan Quintela 已提交
497
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
498 499 500 501 502 503
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
504
                 rs->bitmap_sync_count);
505 506 507 508 509 510 511 512 513 514 515
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
516
 * @rs: current RAM state
517 518
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
519 520 521 522
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
J
Juan Quintela 已提交
523
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
524
                            ram_addr_t current_addr, RAMBlock *block,
525
                            ram_addr_t offset, bool last_stage)
526 527 528 529
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

530
    if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
531
        rs->xbzrle_cache_miss++;
532 533
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
534
                             rs->bitmap_sync_count) == -1) {
535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
555
        trace_save_xbzrle_page_skipping();
556 557
        return 0;
    } else if (encoded_len == -1) {
558
        trace_save_xbzrle_page_overflow();
559
        rs->xbzrle_overflows++;
560 561 562 563 564 565 566 567 568 569 570 571 572 573
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
J
Juan Quintela 已提交
574
    bytes_xbzrle = save_page_header(rs, block,
J
Juan Quintela 已提交
575 576 577 578
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
579
    bytes_xbzrle += encoded_len + 1 + 2;
580
    rs->xbzrle_pages++;
581
    rs->xbzrle_bytes += bytes_xbzrle;
582
    rs->bytes_transferred += bytes_xbzrle;
583 584 585 586

    return 1;
}

587 588
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
589
 *
590 591 592 593
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
J
Juan Quintela 已提交
594
 * @rs: current RAM state
595
 * @rb: RAMBlock where to search for dirty pages
596
 * @start: page where we start the search
597
 */
598
static inline
599
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
600
                                          unsigned long start)
601
{
602 603
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
604 605
    unsigned long next;

606 607
    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
608
    } else {
609
        next = find_next_bit(bitmap, size, start);
610 611
    }

612
    return next;
613 614
}

615
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
616 617
                                                RAMBlock *rb,
                                                unsigned long page)
618 619 620
{
    bool ret;

621
    ret = test_and_clear_bit(page, rb->bmap);
622 623

    if (ret) {
624
        rs->migration_dirty_pages--;
625 626 627 628
    }
    return ret;
}

629 630
static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
631
{
632
    rs->migration_dirty_pages +=
633
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
634
                                              &rs->num_dirty_pages_period);
635 636
}

637 638 639 640 641 642 643 644
/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
645 646 647 648 649 650
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

P
Peter Xu 已提交
651
    RAMBLOCK_FOREACH(block) {
652 653 654 655 656 657
        summary |= block->page_size;
    }

    return summary;
}

658
static void migration_bitmap_sync(RAMState *rs)
659 660 661
{
    RAMBlock *block;
    int64_t end_time;
662
    uint64_t bytes_xfer_now;
663

664
    rs->bitmap_sync_count++;
665

666 667
    if (!rs->bytes_xfer_prev) {
        rs->bytes_xfer_prev = ram_bytes_transferred();
668 669
    }

670 671
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
672 673 674
    }

    trace_migration_bitmap_sync_start();
675
    memory_global_dirty_log_sync();
676

677
    qemu_mutex_lock(&rs->bitmap_mutex);
678
    rcu_read_lock();
P
Peter Xu 已提交
679
    RAMBLOCK_FOREACH(block) {
680
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
681 682
    }
    rcu_read_unlock();
683
    qemu_mutex_unlock(&rs->bitmap_mutex);
684

685
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
686

687 688 689
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 millisecons */
690
    if (end_time > rs->time_last_bitmap_sync + 1000) {
691 692 693 694
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
695 696
               were in this routine. If that happens twice, start or increase
               throttling */
697
            bytes_xfer_now = ram_bytes_transferred();
698

699
            if (rs->dirty_pages_rate &&
700
               (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
701
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
702
               (rs->dirty_rate_high_cnt++ >= 2)) {
703
                    trace_migration_throttle();
704
                    rs->dirty_rate_high_cnt = 0;
705
                    mig_throttle_guest_down();
706
             }
707
             rs->bytes_xfer_prev = bytes_xfer_now;
708
        }
709

710
        if (migrate_use_xbzrle()) {
711
            if (rs->iterations_prev != rs->iterations) {
712
                rs->xbzrle_cache_miss_rate =
713
                   (double)(rs->xbzrle_cache_miss -
714
                            rs->xbzrle_cache_miss_prev) /
715
                   (rs->iterations - rs->iterations_prev);
716
            }
717
            rs->iterations_prev = rs->iterations;
718
            rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
719
        }
720
        rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
721 722
            / (end_time - rs->time_last_bitmap_sync);
        rs->time_last_bitmap_sync = end_time;
723
        rs->num_dirty_pages_period = 0;
724
    }
725
    if (migrate_use_events()) {
726
        qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
727
    }
728 729 730
}

/**
731
 * save_zero_page: send the zero page to the stream
732
 *
733
 * Returns the number of pages written.
734
 *
735
 * @rs: current RAM state
736 737 738 739
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 */
J
Juan Quintela 已提交
740 741
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p)
742 743 744 745
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
746
        rs->zero_pages++;
747
        rs->bytes_transferred +=
J
Juan Quintela 已提交
748
            save_page_header(rs, block, offset | RAM_SAVE_FLAG_COMPRESS);
J
Juan Quintela 已提交
749
        qemu_put_byte(rs->f, 0);
750
        rs->bytes_transferred += 1;
751 752 753 754 755 756
        pages = 1;
    }

    return pages;
}

757
static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
758
{
759
    if (!migrate_release_ram() || !migration_in_postcopy()) {
760 761 762
        return;
    }

763
    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
764 765
}

766
/**
767
 * ram_save_page: send the given page to the stream
768
 *
769
 * Returns the number of pages written.
770 771 772
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
773
 *
J
Juan Quintela 已提交
774
 * @rs: current RAM state
775 776 777 778
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
779
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
780 781 782 783 784 785 786
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
787
    RAMBlock *block = pss->block;
788
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
789

790
    p = block->host + offset;
791
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
792 793 794

    /* In doubt sent page as normal */
    bytes_xmit = 0;
J
Juan Quintela 已提交
795
    ret = ram_control_save_page(rs->f, block->offset,
796 797
                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
798
        rs->bytes_transferred += bytes_xmit;
799 800 801 802 803 804 805 806 807 808
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
J
Juan Quintela 已提交
809
                rs->norm_pages++;
810
            } else if (bytes_xmit == 0) {
811
                rs->zero_pages++;
812 813 814
            }
        }
    } else {
J
Juan Quintela 已提交
815
        pages = save_zero_page(rs, block, offset, p);
816 817 818 819
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
J
Juan Quintela 已提交
820
            xbzrle_cache_zero_page(rs, current_addr);
821
            ram_release_pages(block->idstr, offset, pages);
J
Juan Quintela 已提交
822
        } else if (!rs->ram_bulk_stage &&
823
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
J
Juan Quintela 已提交
824
            pages = save_xbzrle_page(rs, &p, current_addr, block,
825
                                     offset, last_stage);
826 827 828 829 830 831 832 833 834 835 836
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
J
Juan Quintela 已提交
837 838
        rs->bytes_transferred += save_page_header(rs, block,
                                                  offset | RAM_SAVE_FLAG_PAGE);
839
        if (send_async) {
J
Juan Quintela 已提交
840
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
841
                                  migrate_release_ram() &
842
                                  migration_in_postcopy());
843
        } else {
J
Juan Quintela 已提交
844
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
845
        }
846
        rs->bytes_transferred += TARGET_PAGE_SIZE;
847
        pages = 1;
J
Juan Quintela 已提交
848
        rs->norm_pages++;
849 850 851 852 853 854 855
    }

    XBZRLE_cache_unlock();

    return pages;
}

856 857
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
858
{
J
Juan Quintela 已提交
859
    RAMState *rs = &ram_state;
860
    int bytes_sent, blen;
861
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
862

J
Juan Quintela 已提交
863
    bytes_sent = save_page_header(rs, block, offset |
864
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
865
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
866
                                     migrate_compress_level());
867 868 869 870 871 872
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
873
        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
874
    }
875 876 877 878

    return bytes_sent;
}

J
Juan Quintela 已提交
879
static void flush_compressed_data(RAMState *rs)
880 881 882 883 884 885 886
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
887

L
Liang Li 已提交
888
    qemu_mutex_lock(&comp_done_lock);
889
    for (idx = 0; idx < thread_count; idx++) {
890
        while (!comp_param[idx].done) {
L
Liang Li 已提交
891
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
892
        }
893
    }
L
Liang Li 已提交
894
    qemu_mutex_unlock(&comp_done_lock);
895 896 897

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
898
        if (!comp_param[idx].quit) {
J
Juan Quintela 已提交
899
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
900
            rs->bytes_transferred += len;
901
        }
902
        qemu_mutex_unlock(&comp_param[idx].mutex);
903 904 905 906 907 908 909 910 911 912
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

J
Juan Quintela 已提交
913 914
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
915 916 917 918
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
L
Liang Li 已提交
919
    qemu_mutex_lock(&comp_done_lock);
920 921 922
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
923
                comp_param[idx].done = false;
J
Juan Quintela 已提交
924
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
925
                qemu_mutex_lock(&comp_param[idx].mutex);
926
                set_compress_params(&comp_param[idx], block, offset);
927 928
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
929
                pages = 1;
J
Juan Quintela 已提交
930
                rs->norm_pages++;
931
                rs->bytes_transferred += bytes_xmit;
932 933 934 935 936 937
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
L
Liang Li 已提交
938
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
939 940
        }
    }
L
Liang Li 已提交
941
    qemu_mutex_unlock(&comp_done_lock);
942 943 944 945 946 947 948

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
949
 * Returns the number of pages written.
950
 *
J
Juan Quintela 已提交
951
 * @rs: current RAM state
952 953 954 955
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
956 957
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
958 959
{
    int pages = -1;
960
    uint64_t bytes_xmit = 0;
961
    uint8_t *p;
962
    int ret, blen;
963
    RAMBlock *block = pss->block;
964
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
965

966
    p = block->host + offset;
967

J
Juan Quintela 已提交
968
    ret = ram_control_save_page(rs->f, block->offset,
969 970
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
971
        rs->bytes_transferred += bytes_xmit;
972 973 974 975 976
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
J
Juan Quintela 已提交
977
                rs->norm_pages++;
978
            } else if (bytes_xmit == 0) {
979
                rs->zero_pages++;
980 981 982 983 984 985 986 987 988
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
J
Juan Quintela 已提交
989
        if (block != rs->last_sent_block) {
J
Juan Quintela 已提交
990 991
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset, p);
992
            if (pages == -1) {
993
                /* Make sure the first page is sent out before other pages */
J
Juan Quintela 已提交
994
                bytes_xmit = save_page_header(rs, block, offset |
995
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
J
Juan Quintela 已提交
996
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
997 998
                                                 migrate_compress_level());
                if (blen > 0) {
999
                    rs->bytes_transferred += bytes_xmit + blen;
J
Juan Quintela 已提交
1000
                    rs->norm_pages++;
1001
                    pages = 1;
1002
                } else {
J
Juan Quintela 已提交
1003
                    qemu_file_set_error(rs->f, blen);
1004
                    error_report("compressed data failed!");
1005
                }
1006
            }
1007
            if (pages > 0) {
1008
                ram_release_pages(block->idstr, offset, pages);
1009
            }
1010
        } else {
J
Juan Quintela 已提交
1011
            pages = save_zero_page(rs, block, offset, p);
1012
            if (pages == -1) {
J
Juan Quintela 已提交
1013
                pages = compress_page_with_multi_thread(rs, block, offset);
1014
            } else {
1015
                ram_release_pages(block->idstr, offset, pages);
1016 1017 1018 1019 1020 1021 1022
            }
        }
    }

    return pages;
}

1023 1024 1025
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
1026
 *
1027
 * Returns if a page is found
1028
 *
J
Juan Quintela 已提交
1029
 * @rs: current RAM state
1030 1031
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
1032
 */
1033
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1034
{
1035
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
J
Juan Quintela 已提交
1036
    if (pss->complete_round && pss->block == rs->last_seen_block &&
1037
        pss->page >= rs->last_page) {
1038 1039 1040 1041 1042 1043 1044
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
1045
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1046
        /* Didn't find anything in this RAM Block */
1047
        pss->page = 0;
1048 1049 1050 1051 1052 1053
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
J
Juan Quintela 已提交
1054
            rs->ram_bulk_stage = false;
1055 1056 1057 1058
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
J
Juan Quintela 已提交
1059
                flush_compressed_data(rs);
1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

1073 1074 1075
/**
 * unqueue_page: gets a page of the queue
 *
1076 1077
 * Helper for 'get_queued_page' - gets a page off the queue
 *
1078 1079
 * Returns the block of the page (or NULL if none available)
 *
1080
 * @rs: current RAM state
1081
 * @offset: used to return the offset within the RAMBlock
1082
 */
1083
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1084 1085 1086
{
    RAMBlock *block = NULL;

1087 1088 1089 1090
    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1091 1092 1093 1094 1095 1096 1097 1098
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
1099
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1100 1101 1102
            g_free(entry);
        }
    }
1103
    qemu_mutex_unlock(&rs->src_page_req_mutex);
1104 1105 1106 1107

    return block;
}

1108 1109 1110 1111
/**
 * get_queued_page: unqueue a page from the postocpy requests
 *
 * Skips pages that are already sent (!dirty)
1112
 *
1113
 * Returns if a queued page is found
1114
 *
J
Juan Quintela 已提交
1115
 * @rs: current RAM state
1116
 * @pss: data about the state of the current dirty page scan
1117
 */
1118
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1119 1120 1121 1122 1123 1124
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
1125
        block = unqueue_page(rs, &offset);
1126 1127 1128 1129 1130 1131 1132
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
1133 1134
            unsigned long page;

1135 1136
            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
1137
            if (!dirty) {
1138
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1139
                       page, test_bit(page, block->unsentmap));
1140
            } else {
1141
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
J
Juan Quintela 已提交
1154
        rs->ram_bulk_stage = false;
1155 1156 1157 1158 1159 1160 1161

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
1162
        pss->page = offset >> TARGET_PAGE_BITS;
1163 1164 1165 1166 1167
    }

    return !!block;
}

1168
/**
1169 1170
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
1171
 *
1172 1173 1174
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  in case that there is any page left, we drop it.
 *
1175
 */
1176
void migration_page_queue_free(void)
1177
{
1178 1179
    struct RAMSrcPageRequest *mspr, *next_mspr;
    RAMState *rs = &ram_state;
1180 1181 1182 1183
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
1184
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1185
        memory_region_unref(mspr->rb->mr);
1186
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1187 1188 1189 1190 1191 1192
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
1193 1194 1195 1196 1197 1198 1199 1200 1201 1202
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
1203
 */
1204
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1205 1206
{
    RAMBlock *ramblock;
1207
    RAMState *rs = &ram_state;
1208

1209
    rs->postcopy_requests++;
1210 1211 1212
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
1213
        ramblock = rs->last_req_rb;
1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
1231
        rs->last_req_rb = ramblock;
1232 1233 1234
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start+len > ramblock->used_length) {
1235 1236
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1237 1238 1239 1240
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

1241 1242
    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
1243 1244 1245 1246 1247
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
1248 1249 1250
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
1251 1252 1253 1254 1255 1256 1257 1258 1259
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

1260
/**
1261
 * ram_save_target_page: save one target page
1262
 *
1263
 * Returns the number of pages written
1264
 *
J
Juan Quintela 已提交
1265
 * @rs: current RAM state
1266 1267
 * @ms: current migration state
 * @pss: data about the page we want to send
1268 1269
 * @last_stage: if we are at the completion stage
 */
1270
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1271
                                bool last_stage)
1272 1273 1274 1275
{
    int res = 0;

    /* Check the pages is dirty and if it is send it */
1276
    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1277 1278 1279 1280 1281
        /*
         * If xbzrle is on, stop using the data compression after first
         * round of migration even if compression is enabled. In theory,
         * xbzrle can do better than compression.
         */
1282 1283
        if (migrate_use_compression() &&
            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1284
            res = ram_save_compressed_page(rs, pss, last_stage);
1285
        } else {
1286
            res = ram_save_page(rs, pss, last_stage);
1287 1288 1289 1290 1291
        }

        if (res < 0) {
            return res;
        }
1292 1293
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
1294 1295 1296 1297 1298 1299 1300
        }
    }

    return res;
}

/**
1301
 * ram_save_host_page: save a whole host page
1302
 *
1303 1304 1305 1306 1307
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
1308
 *
1309 1310
 * Returns the number of pages written or negative on error
 *
J
Juan Quintela 已提交
1311
 * @rs: current RAM state
1312 1313
 * @ms: current migration state
 * @pss: data about the page we want to send
1314 1315
 * @last_stage: if we are at the completion stage
 */
1316
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1317
                              bool last_stage)
1318 1319
{
    int tmppages, pages = 0;
1320 1321
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1322

1323
    do {
1324
        tmppages = ram_save_target_page(rs, pss, last_stage);
1325 1326 1327 1328 1329
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
1330 1331
        pss->page++;
    } while (pss->page & (pagesize_bits - 1));
1332 1333

    /* The offset we leave with is the last one we looked at */
1334
    pss->page--;
1335 1336
    return pages;
}
1337

1338
/**
1339
 * ram_find_and_save_block: finds a dirty page and sends it to f
1340 1341 1342
 *
 * Called within an RCU critical section.
 *
1343
 * Returns the number of pages written where zero means no dirty pages
1344
 *
J
Juan Quintela 已提交
1345
 * @rs: current RAM state
1346
 * @last_stage: if we are at the completion stage
1347 1348 1349
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
1350 1351
 */

J
Juan Quintela 已提交
1352
static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1353
{
1354
    PageSearchStatus pss;
1355
    int pages = 0;
1356
    bool again, found;
1357

1358 1359 1360 1361 1362
    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

J
Juan Quintela 已提交
1363
    pss.block = rs->last_seen_block;
1364
    pss.page = rs->last_page;
1365 1366 1367 1368 1369
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }
1370

1371
    do {
1372
        again = true;
1373
        found = get_queued_page(rs, &pss);
1374

1375 1376
        if (!found) {
            /* priority queue empty, so just search for something dirty */
1377
            found = find_dirty_block(rs, &pss, &again);
1378
        }
1379

1380
        if (found) {
1381
            pages = ram_save_host_page(rs, &pss, last_stage);
1382
        }
1383
    } while (!pages && again);
1384

J
Juan Quintela 已提交
1385
    rs->last_seen_block = pss.block;
1386
    rs->last_page = pss.page;
1387 1388 1389 1390 1391 1392 1393

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
1394 1395
    RAMState *rs = &ram_state;

1396
    if (zero) {
1397
        rs->zero_pages += pages;
1398
    } else {
J
Juan Quintela 已提交
1399
        rs->norm_pages += pages;
1400
        rs->bytes_transferred += size;
1401 1402 1403 1404 1405 1406 1407 1408 1409 1410
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
P
Peter Xu 已提交
1411
    RAMBLOCK_FOREACH(block) {
1412
        total += block->used_length;
P
Peter Xu 已提交
1413
    }
1414 1415 1416 1417 1418 1419 1420 1421 1422 1423
    rcu_read_unlock();
    return total;
}

void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}

L
Liang Li 已提交
1424
static void ram_migration_cleanup(void *opaque)
1425
{
1426
    RAMBlock *block;
1427

L
Li Zhijian 已提交
1428 1429 1430
    /* caller have hold iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
1431 1432 1433 1434 1435 1436 1437
    memory_global_dirty_log_stop();

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
1438 1439 1440 1441 1442 1443 1444
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
1445
        g_free(ZERO_TARGET_PAGE);
1446 1447 1448 1449 1450 1451 1452
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}

J
Juan Quintela 已提交
1453
static void ram_state_reset(RAMState *rs)
1454
{
J
Juan Quintela 已提交
1455 1456
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
1457
    rs->last_page = 0;
J
Juan Quintela 已提交
1458 1459
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
1460 1461 1462 1463
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

1464 1465 1466 1467 1468
/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
1469 1470
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
1471 1472 1473 1474 1475
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

1476
    for (cur = 0; cur < pages; cur += linelen) {
1477 1478 1479 1480 1481 1482
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
1483 1484
        if (cur + linelen > pages) {
            linelen = pages - cur;
1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

1498 1499
/* **** functions for postcopy ***** */

1500 1501 1502 1503
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

P
Peter Xu 已提交
1504
    RAMBLOCK_FOREACH(block) {
1505 1506 1507
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1508 1509 1510

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1511
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1512 1513 1514 1515 1516 1517
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

1518 1519 1520 1521 1522
/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
1523 1524 1525
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1526 1527 1528 1529 1530
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @start: RAMBlock starting page
 * @length: RAMBlock size
1531 1532 1533
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
1534
                                        RAMBlock *block)
1535
{
1536
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1537
    unsigned long current;
1538
    unsigned long *unsentmap = block->unsentmap;
1539

1540
    for (current = 0; current < end; ) {
1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
1552 1553 1554
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
1555 1556 1557 1558 1559 1560 1561 1562 1563
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

1564 1565 1566 1567 1568
/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
1569 1570 1571 1572 1573
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
1574 1575
 *
 * @ms: current migration state
1576 1577 1578 1579 1580 1581
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

P
Peter Xu 已提交
1582
    RAMBLOCK_FOREACH(block) {
1583 1584
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);
1585 1586 1587 1588 1589 1590

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
1591
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
1592 1593 1594 1595 1596 1597 1598 1599 1600
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, which are similar but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = &ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}

/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page.
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted; this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = &ram_state;
    RAMBlock *block;
    int ret;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}

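/**
 * ram_state_init: set up RAMState and the dirty bitmaps for a new run
 *
 * Returns zero on success or -1 if the XBZRLE cache or its buffers
 * could not be allocated
 *
 * @rs: RAMState to initialize; it is zeroed, the per-block bitmaps are
 *      allocated and an initial bitmap sync is performed
 */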
static int ram_state_init(RAMState *rs)
{
    memset(rs, 0, sizeof(*rs));
    qemu_mutex_init(&rs->bitmap_mutex);
    qemu_mutex_init(&rs->src_page_req_mutex);
    QSIMPLEQ_INIT(&rs->src_page_requests);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }
    }

    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    ram_state_reset(rs);

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        RAMBlock *block;

        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;

            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;
    RAMBlock *block;

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_state_init(rs) < 0) {
            return -1;
        }
    }
    rs->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once every
           few iterations.
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    rs->bytes_transferred += 8; /* account for the 8-byte EOS marker just sent */

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

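/**
 * ram_save_pending: report how much RAM data is still left to send
 *
 * Re-syncs the dirty bitmap (taking the iothread lock) when the known
 * remainder drops below @max_size, then accounts the remainder as
 * postcopiable data.
 *
 * @f: QEMUFile the stream is being written to
 * @opaque: RAMState pointer
 * @max_size: threshold below which the dirty bitmap is re-synced
 * @non_postcopiable_pending: pending data that must be sent before postcopy
 * @postcopiable_pending: pending data that may be sent during postcopy
 */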
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState *rs = opaque;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

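/**
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * Returns 0 on success or -1 if the header, length or encoded data
 * is invalid
 *
 * @f: QEMUFile to read the encoded page from
 * @addr: address of the page being loaded (currently unused here)
 * @host: host address the decoded delta is applied to
 */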
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

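/**
 * host_from_ram_block_offset: turn a block offset into a host pointer
 *
 * Returns the host address for @offset within @block, or NULL when the
 * offset lies outside the block's used length.
 *
 * @block: RAMBlock the offset belongs to
 * @offset: offset within the block
 */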
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

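/**
 * do_data_decompress: decompression worker thread
 *
 * Waits for work queued in its DecompressParam, inflates the compressed
 * buffer directly into guest memory and signals decomp_done_cond, until
 * param->quit is set.
 *
 * @opaque: DecompressParam for this thread
 */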
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while being compressed; that's not a
             * problem because the dirty page will be retransmitted and
             * uncompress() won't corrupt the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

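/**
 * wait_for_decompress_done: wait until all decompression work is finished
 *
 * Blocks until every decompression thread has marked its work item as
 * done; it is a no-op when compression is not in use.
 */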
static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

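/**
 * migrate_decompress_threads_create: start the decompression thread pool
 *
 * Allocates one DecompressParam and one joinable thread per configured
 * decompression thread and initializes the shared done lock/condition.
 */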
void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

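/**
 * migrate_decompress_threads_join: stop and clean up the thread pool
 *
 * Asks every decompression thread to quit, joins it and releases the
 * per-thread buffers and synchronization objects.
 */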
void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

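/**
 * decompress_data_with_multi_threads: hand a compressed page to a worker
 *
 * Picks an idle decompression thread, copies the compressed data into
 * its buffer and wakes it; waits on decomp_done_cond when all workers
 * are busy.
 *
 * @f: QEMUFile to read the compressed data from
 * @host: host address the decompressed page will be written to
 * @len: length of the compressed data
 */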
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the page data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages;
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block->page_size);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block->page_size);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}

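/**
 * ram_load: main entry point for loading the RAM section of the stream
 *
 * Returns 0 for success or a negative error code on failure
 *
 * @f: QEMUFile to read the stream from
 * @opaque: RAMState pointer (unused on the load side)
 * @version_id: stream version; only version 4 is accepted
 */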
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};

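/**
 * ram_mig_init: register the RAM live-migration handlers
 *
 * Initializes the XBZRLE lock and registers savevm_ram_handlers with the
 * migration core, passing the global ram_state as the opaque pointer.
 */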
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}