/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "migration/block.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}
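
/*
 * For example (a usage sketch, values are illustrative only): the cache
 * can be resized at runtime through QMP while a migration is running,
 *
 *   { "execute": "migrate-set-cache-size",
 *     "arguments": { "value": 536870912 } }
 *
 * which reaches qmp_migrate_set_cache_size() and from there ends up in
 * xbzrle_cache_resize(536870912, &err).
 */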

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static void compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/* Multiple fd's */

struct MultiFDSendParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;

static void terminate_multifd_send_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_save_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_send_threads(NULL);
    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
    return ret;
}

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_save_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
    multifd_send_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdsend_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                           QEMU_THREAD_JOINABLE);

        multifd_send_state->count++;
    }
    return 0;
}

struct MultiFDRecvParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;

static void terminate_multifd_recv_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_recv_threads(NULL);
    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}

static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_load_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
    multifd_recv_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdrecv_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                           QEMU_THREAD_JOINABLE);
        multifd_recv_state->count++;
    }
    return 0;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
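
/*
 * What this puts on the wire (a sketch, not an exhaustive format
 * description):
 *
 *   be64:  page offset within the RAMBlock, with the RAM_SAVE_FLAG_*
 *          bits packed into the low bits left free by page alignment
 *   if RAM_SAVE_FLAG_CONTINUE is clear:
 *       u8:    length of the RAMBlock idstr
 *       bytes: the idstr itself
 *
 * followed by the page payload written by the caller (raw page, zero
 * marker, XBZRLE data or compressed data, depending on the flag).
 */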

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}
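
/*
 * Worked example, assuming the default migration parameters
 * (cpu-throttle-initial = 20, cpu-throttle-increment = 10): the first
 * call throttles the vCPUs to 20%, and each further call while
 * throttling is active raises it to 30%, 40%, ... until the guest's
 * dirty rate drops below the transfer rate.
 */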

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
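
/*
 * Sketch of what save_xbzrle_page() puts on the wire after the page
 * header: a one-byte ENCODING_FLAG_XBZRLE marker, a be16 length, and
 * then encoded_len bytes of XBZRLE-encoded delta against the cached
 * copy of the page; hence the "+ 1 + 2" when accounting bytes_xbzrle.
 */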

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH(block) {
        summary |= block->page_size;
    }

    return summary;
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes are 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
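
/*
 * Numeric illustration of the trigger above, assuming 4 KiB target
 * pages: if roughly 1 GiB was transferred during the last period but
 * more than ~512 MiB worth of pages were dirtied in the same period,
 * the period counts as "dirtying too fast"; two such periods in a row
 * call mig_throttle_guest_down().
 */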

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        ram_counters.duplicate++;
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(rs->f, 0);
        ram_counters.transferred += 1;
        pages = 1;
    }

    return pages;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* In doubt, send page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(rs->f, block->offset,
                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        pages = save_zero_page(rs, block, offset, p);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(block->idstr, offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
        }
        ram_counters.transferred += TARGET_PAGE_SIZE;
        pages = 1;
        ram_counters.normal++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
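
/*
 * In short, ram_save_page() tries the transports in this order: an
 * RDMA/control-channel override via ram_control_save_page(), then the
 * zero-page test, then (outside the bulk stage) XBZRLE delta encoding,
 * and finally the raw page with RAM_SAVE_FLAG_PAGE as the fallback.
 */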

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    RAMState *rs = ram_state;
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(rs, f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            ram_counters.transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs);
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start+len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}
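
/*
 * Illustrative call (block name and address are examples only): a
 * postcopy page fault on the destination for a guest address inside
 * the "pc.ram" block would be forwarded to the source roughly as
 *
 *     ram_save_queue_pages("pc.ram", 0x7f2000, TARGET_PAGE_SIZE);
 *
 * and the queued request is later picked up by get_queued_page().
 */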

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    int res = 0;

    /* Check if the page is dirty and if so, send it */
    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
        /*
         * If xbzrle is on, stop using the data compression after first
         * round of migration even if compression is enabled. In theory,
         * xbzrle can do better than compression.
         */
        if (migrate_use_compression() &&
            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
            res = ram_save_compressed_page(rs, pss, last_stage);
        } else {
            res = ram_save_page(rs, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }
    }

    return res;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    do {
        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
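
/*
 * Worked example, assuming 4 KiB target pages backed by a 2 MiB huge
 * page: pagesize_bits is 512, so the loop above walks up to 512
 * consecutive target pages and stops at the 2 MiB boundary, sending
 * only those that are still dirty.
 */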

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        total += block->used_length;
    }
    rcu_read_unlock();
    return total;
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void ram_state_cleanup(RAMState **rsp)
{
    migration_page_queue_free(*rsp);
    qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
    qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
    g_free(*rsp);
    *rsp = NULL;
}

static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* The caller must hold the iothread lock or be in a bottom half, so
     * there is no writing race against this migration bitmap.
     */
    memory_global_dirty_log_stop();

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}

static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

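/*
 * Walk every RAMBlock and discard each run of pages whose dirty bit is
 * already clear (i.e. pages that have been sent), so the source side
 * releases that memory as postcopy progresses.
 */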
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *unsentmap = block->unsentmap;

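    /*
     * Each run of set bits in the unsentmap below becomes a single
     * postcopy_discard_send_range() call.
     */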
    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH(block) {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, which are similar but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

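    /*
     * Walk the bitmap in target-page units; host_ratio is the number of
     * target pages per host page, and every fix-up below discards a whole
     * host page and re-marks its target pages as unsent and dirty.
     */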
    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}

/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

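    /*
     * Sequence: one last bitmap sync, per-block host page fix-ups
     * (postcopy_chunk_hostpages), fold the dirty bitmap into the
     * unsentmap, then send the discard ranges to the destination.
     */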
    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same that last one.
 * @start: byte offset within the RAMBlock at which to start discarding
 * @length: number of bytes to discard
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                 length >> qemu_target_page_bits());
    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}

/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    ram_state_reset(*rsp);

    return 0;
}

static void ram_list_init_bitmaps(void)
{
    RAMBlock *block;
    unsigned long pages;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            pages = block->max_length >> TARGET_PAGE_BITS;
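            /*
             * Start fully set: every page is considered dirty so it is
             * sent at least once during the bulk stage.
             */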
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }
}

static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
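 *
 * The setup section written below is: a be64 of ram_bytes_total() tagged
 * with RAM_SAVE_FLAG_MEM_SIZE, then for each RAMBlock its idstr length,
 * idstr, used_length and (for postcopy with huge pages) its page size,
 * terminated by RAM_SAVE_FLAG_EOS.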
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            return -1;
        }
    }
    (*rsp)->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();
    compress_threads_save_setup();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
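 *
 * Pages are sent until qemu_file_rate_limit() trips, there are no dirty
 * pages left, or roughly MAX_WAIT ms have elapsed in this call.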
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check every
           64 iterations.
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

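/*
 * ram_save_pending: report how much data is still dirty.  If the estimate
 * has dropped below @max_size, the dirty bitmap is synced first so the
 * decision to enter the completion stage is made on fresh numbers; the
 * result counts as postcopiable whenever postcopy RAM is enabled.
 */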
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *postcopiable_pending += remaining_size;
    } else {
        *non_postcopiable_pending += remaining_size;
    }
}

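/*
 * load_xbzrle: decode one XBZRLE-encoded page from the stream into @host,
 * using the previous version of the page already present there as the
 * base for the delta.
 */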
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is not
             * a problem: the dirty page will be retransmitted and
             * uncompress() won't corrupt the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

static void compress_threads_load_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

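/*
 * Hand one compressed page to the first idle decompression thread; if all
 * threads are busy, wait on decomp_done_cond until one becomes free.
 */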
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
2615 2616
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2617 2618
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    xbzrle_load_setup();
    compress_threads_load_setup();
    ramblock_recv_map_init();
    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;
    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }
    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses (possibly smaller) target pages;
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}

static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

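/*
 * ram_load: incoming counterpart of ram_save_iterate()/ram_save_complete().
 * Every record starts with a be64 combining the page address with
 * RAM_SAVE_FLAG_* bits; the payload that follows depends on those flags
 * (block list for MEM_SIZE, a fill byte for ZERO, a raw page for PAGE,
 * zlib data for COMPRESS_PAGE, a delta for XBZRLE).
 */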
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0, invalid_flags = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }
    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ramblock_recv_bitmap_set(block, host);
            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    return migrate_postcopy_ram();
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
2993 2994 2995 2996 2997
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}