/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "qemu/pmem.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "socket.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "qemu/uuid.h"
#include "savevm.h"
#include "qemu/iov.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
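
/*
 * Note: these flags are OR'ed into the low bits of the page offset that
 * save_page_header() below writes to the stream.  This works because
 * page offsets are always TARGET_PAGE_SIZE aligned, so (sketch):
 *
 *     | aligned page offset             | RAM_SAVE_FLAG_* bits |
 */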

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct containing the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}
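
/*
 * Usage sketch (illustration only; the real caller is the QMP handler
 * mentioned in the comment above, whose surrounding code is not shown):
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */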

static bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

/* Should be holding either ram_list.mutex, or the RCU lock. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    rcu_read_unlock();
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns the number of sent bytes (> 0) on success, or < 0 on error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough; e.g., on 32bit
     * machines we may need 4 more bytes for padding (see the comment
     * below), so extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required because the source and destination VMs might not use
     * the same endianness.  (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark the end, in case the middle part is screwed up for
     * some mysterious reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
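
/*
 * The resulting stream for one RAMBlock's receive bitmap is therefore:
 *
 *     | be64: size | le_bitmap (size bytes, size rounded up to 8) |
 *     | be64: RAMBLOCK_RECV_BITMAP_ENDING |
 *
 * The bitmap itself is always little endian so that source and
 * destination of different endianness agree on its layout.
 */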

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/* Multiple fd's */

#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

#define MULTIFD_FLAG_SYNC (1 << 0)

typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
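
/*
 * One MultiFDInit_t is sent over each channel right after it is created
 * (see multifd_send_initial_packet() below): magic and version are sent
 * in big endian, uuid carries the raw QemuUUID of the source, and id
 * tells the destination which channel number this connection is.
 */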

typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    uint32_t size;
    uint32_t used;
    uint64_t packet_num;
    char ramblock[256];
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
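
/*
 * Every batch of pages on a channel is preceded by one MultiFDPacket_t.
 * All multi-byte fields are converted to big endian before sending (see
 * multifd_send_fill_packet() below); offset[] is a flexible array with
 * one entry per page, so the allocated packet length depends on
 * migrate_multifd_page_count().
 */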

typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    RAMBlock *block;
} MultiFDPages_t;

typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;

typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;

static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    msg.magic = cpu_to_be32(MULTIFD_MAGIC);
    msg.version = cpu_to_be32(MULTIFD_VERSION);
    msg.id = p->id;
    memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));

    ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }
    return 0;
}

static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }

    msg.magic = be32_to_cpu(msg.magic);
    msg.version = be32_to_cpu(msg.version);

    if (msg.magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x "
                   "expected %x", msg.magic, MULTIFD_MAGIC);
        return -1;
    }

    if (msg.version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet version %d "
                   "expected %d", msg.version, MULTIFD_VERSION);
        return -1;
    }

    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);

        error_setg(errp, "multifd: received uuid '%s' and expected "
                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
        g_free(uuid);
        g_free(msg_uuid);
        return -1;
    }

    if (msg.id > migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel version %d "
                   "expected %d", msg.version, MULTIFD_VERSION);
        return -1;
    }

    return msg.id;
}

static MultiFDPages_t *multifd_pages_init(size_t size)
{
    MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);

    pages->allocated = size;
    pages->iov = g_new0(struct iovec, size);
    pages->offset = g_new0(ram_addr_t, size);

    return pages;
}

static void multifd_pages_clear(MultiFDPages_t *pages)
{
    pages->used = 0;
    pages->allocated = 0;
    pages->packet_num = 0;
    pages->block = NULL;
    g_free(pages->iov);
    pages->iov = NULL;
    g_free(pages->offset);
    pages->offset = NULL;
    g_free(pages);
}

static void multifd_send_fill_packet(MultiFDSendParams *p)
{
    MultiFDPacket_t *packet = p->packet;
    int i;

    packet->magic = cpu_to_be32(MULTIFD_MAGIC);
    packet->version = cpu_to_be32(MULTIFD_VERSION);
    packet->flags = cpu_to_be32(p->flags);
    packet->size = cpu_to_be32(migrate_multifd_page_count());
    packet->used = cpu_to_be32(p->pages->used);
    packet->packet_num = cpu_to_be64(p->packet_num);

    if (p->pages->block) {
        strncpy(packet->ramblock, p->pages->block->idstr, 256);
    }

    for (i = 0; i < p->pages->used; i++) {
        packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
    }
}

static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
    MultiFDPacket_t *packet = p->packet;
    RAMBlock *block;
    int i;

    packet->magic = be32_to_cpu(packet->magic);
    if (packet->magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet "
                   "magic %x and expected magic %x",
                   packet->magic, MULTIFD_MAGIC);
        return -1;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet "
                   "version %d and expected version %d",
                   packet->version, MULTIFD_VERSION);
        return -1;
    }

    p->flags = be32_to_cpu(packet->flags);

    packet->size = be32_to_cpu(packet->size);
    if (packet->size > migrate_multifd_page_count()) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   packet->size, migrate_multifd_page_count());
        return -1;
    }

    p->pages->used = be32_to_cpu(packet->used);
    if (p->pages->used > packet->size) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   p->pages->used, packet->size) ;
        return -1;
    }

    p->packet_num = be64_to_cpu(packet->packet_num);

    if (p->pages->used) {
        /* make sure that ramblock is 0 terminated */
        packet->ramblock[255] = 0;
        block = qemu_ram_block_by_name(packet->ramblock);
        if (!block) {
            error_setg(errp, "multifd: unknown ram block %s",
                       packet->ramblock);
            return -1;
        }
    }

    for (i = 0; i < p->pages->used; i++) {
        ram_addr_t offset = be64_to_cpu(packet->offset[i]);

        if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
                       " (max " RAM_ADDR_FMT ")",
                       offset, block->max_length);
            return -1;
        }
        p->pages->iov[i].iov_base = block->host + offset;
        p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
    }

    return 0;
}

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;

/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages array for each channel, and a main one.  Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it.  There are
 * two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */

static void multifd_send_pages(void)
{
    int i;
    static int next_channel;
    MultiFDSendParams *p = NULL; /* make gcc happy */
    MultiFDPages_t *pages = multifd_send_state->pages;
    uint64_t transferred;

    qemu_sem_wait(&multifd_send_state->channels_ready);
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        if (!p->pending_job) {
            p->pending_job++;
            next_channel = (i + 1) % migrate_multifd_channels();
            break;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    p->pages->used = 0;

    p->packet_num = multifd_send_state->packet_num++;
    p->pages->block = NULL;
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
    ram_counters.multifd_bytes += transferred;
    ram_counters.transferred += transferred;
    qemu_mutex_unlock(&p->mutex);
    qemu_sem_post(&p->sem);
}

static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    if (!pages->block) {
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        if (pages->used < pages->allocated) {
            return;
        }
    }

    multifd_send_pages();

    if (pages->block != block) {
        multifd_queue_page(block, offset);
    }
}
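
/*
 * In other words: pages queue up in multifd_send_state->pages until
 * either the batch is full (pages->used == pages->allocated) or a page
 * from a different RAMBlock arrives; at that point multifd_send_pages()
 * hands the whole batch to the first idle channel and the call is
 * retried for the page that did not fit.
 */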

static void multifd_send_terminate_threads(Error *err)
{
    int i;

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
            s->state == MIGRATION_STATUS_DEVICE ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

void multifd_save_cleanup(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    multifd_send_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        socket_send_channel_destroy(p->c);
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    qemu_sem_destroy(&multifd_send_state->channels_ready);
    qemu_sem_destroy(&multifd_send_state->sem_sync);
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    multifd_pages_clear(multifd_send_state->pages);
    multifd_send_state->pages = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
}

static void multifd_send_sync_main(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    if (multifd_send_state->pages->used) {
        multifd_send_pages();
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_signal(p->id);

        qemu_mutex_lock(&p->mutex);

        p->packet_num = multifd_send_state->packet_num++;
        p->flags |= MULTIFD_FLAG_SYNC;
        p->pending_job++;
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_post(&p->sem);
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_wait(p->id);
        qemu_sem_wait(&multifd_send_state->sem_sync);
    }
    trace_multifd_send_sync_main(multifd_send_state->packet_num);
}
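
/*
 * The sync above works as follows: the main thread queues one job with
 * MULTIFD_FLAG_SYNC on every channel, and each channel posts sem_sync
 * once that packet has been written out.  The destination mirrors this
 * in multifd_recv_sync_main(), so both sides agree on a point where all
 * previously queued pages have been transmitted.
 */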

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags);

            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                break;
            }

            if (used) {
                ret = qio_channel_writev_all(p->c, p->pages->iov,
                                             used, &local_err);
                if (ret != 0) {
                    break;
                }
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&multifd_send_state->sem_sync);
            }
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}

static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
{
    MultiFDSendParams *p = opaque;
    QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
    Error *local_err = NULL;

    if (qio_task_propagate_error(task, &local_err)) {
        migrate_set_error(migrate_get_current(), local_err);
        multifd_save_cleanup();
    } else {
        p->c = QIO_CHANNEL(sioc);
        qio_channel_set_delay(p->c, false);
        p->running = true;
        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                           QEMU_THREAD_JOINABLE);

        atomic_inc(&multifd_send_state->count);
    }
}

int multifd_save_setup(void)
{
    int thread_count;
    uint32_t page_count = migrate_multifd_page_count();
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
    atomic_set(&multifd_send_state->count, 0);
    multifd_send_state->pages = multifd_pages_init(page_count);
    qemu_sem_init(&multifd_send_state->sem_sync, 0);
    qemu_sem_init(&multifd_send_state->channels_ready, 0);

    for (i = 0; i < thread_count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        qemu_sem_init(&p->sem_sync, 0);
        p->quit = false;
        p->pending_job = 0;
        p->id = i;
        p->pages = multifd_pages_init(page_count);
        p->packet_len = sizeof(MultiFDPacket_t)
                      + sizeof(ram_addr_t) * page_count;
        p->packet = g_malloc0(p->packet_len);
        p->name = g_strdup_printf("multifdsend_%d", i);
        socket_send_channel_create(multifd_new_send_channel_async, p);
    }
    return 0;
}

struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;

static void multifd_recv_terminate_threads(Error *err)
{
    int i;

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_lock(&p->mutex);
        /* We could arrive here for two reasons:
           - normal quit, i.e. everything went fine, just finished
           - error quit: We close the channels so the channel threads
             finish the qio_channel_read_all_eof() */
        qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    multifd_recv_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        object_unref(OBJECT(p->c));
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    qemu_sem_destroy(&multifd_recv_state->sem_sync);
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}

static void multifd_recv_sync_main(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_wait(p->id);
        qemu_sem_wait(&multifd_recv_state->sem_sync);
        qemu_mutex_lock(&p->mutex);
        if (multifd_recv_state->packet_num < p->packet_num) {
            multifd_recv_state->packet_num = p->packet_num;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_signal(p->id);
        qemu_sem_post(&p->sem_sync);
    }
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}

static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_recv_thread_start(p->id);
    rcu_register_thread();

    while (true) {
        uint32_t used;
        uint32_t flags;

        ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
                                       p->packet_len, &local_err);
        if (ret == 0) {   /* EOF */
            break;
        }
        if (ret == -1) {   /* Error */
            break;
        }

        qemu_mutex_lock(&p->mutex);
        ret = multifd_recv_unfill_packet(p, &local_err);
        if (ret) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }

        used = p->pages->used;
        flags = p->flags;
        trace_multifd_recv(p->id, p->packet_num, used, flags);
        p->num_packets++;
        p->num_pages += used;
        qemu_mutex_unlock(&p->mutex);

        if (used) {
            ret = qio_channel_readv_all(p->c, p->pages->iov,
                                        used, &local_err);
            if (ret != 0) {
                break;
            }
        }

        if (flags & MULTIFD_FLAG_SYNC) {
            qemu_sem_post(&multifd_recv_state->sem_sync);
            qemu_sem_wait(&p->sem_sync);
        }
    }

    if (local_err) {
        multifd_recv_terminate_threads(local_err);
    }
    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}

int multifd_load_setup(void)
{
    int thread_count;
    uint32_t page_count = migrate_multifd_page_count();
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
    atomic_set(&multifd_recv_state->count, 0);
    qemu_sem_init(&multifd_recv_state->sem_sync, 0);

    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem_sync, 0);
        p->id = i;
        p->pages = multifd_pages_init(page_count);
        p->packet_len = sizeof(MultiFDPacket_t)
                      + sizeof(ram_addr_t) * page_count;
        p->packet = g_malloc0(p->packet_len);
        p->name = g_strdup_printf("multifdrecv_%d", i);
    }
    return 0;
}

bool multifd_recv_all_channels_created(void)
{
    int thread_count = migrate_multifd_channels();

    if (!migrate_use_multifd()) {
        return true;
    }

    return thread_count == atomic_read(&multifd_recv_state->count);
}

/*
 * Try to receive all multifd channels to get ready for the migration.
 * - Return true and do not set @errp when correctly receiving all channels;
 * - Return false and do not set @errp when correctly receiving the current one;
 * - Return false and set @errp when failing to receive the current channel.
 */
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    int id;

    id = multifd_recv_initial_packet(ioc, &local_err);
    if (id < 0) {
        multifd_recv_terminate_threads(local_err);
        error_propagate_prepend(errp, local_err,
                                "failed to receive packet"
                                " via multifd channel %d: ",
                                atomic_read(&multifd_recv_state->count));
        return false;
    }

    p = &multifd_recv_state->params[id];
    if (p->c != NULL) {
        error_setg(&local_err, "multifd: received id '%d' already setup'",
                   id);
        multifd_recv_terminate_threads(local_err);
        error_propagate(errp, local_err);
        return false;
    }
    p->c = ioc;
    object_ref(OBJECT(ioc));
    /* initial packet */
    p->num_packets = 1;

    p->running = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    atomic_inc(&multifd_recv_state->count);
    return atomic_read(&multifd_recv_state->count) ==
           migrate_multifd_channels();
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
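
/*
 * The header produced above looks like this on the wire (sketch):
 *
 *     | be64: page offset OR'ed with RAM_SAVE_FLAG_* bits |
 *     | u8: idstr length | idstr bytes |   <- only when RAM_SAVE_FLAG_CONTINUE is clear
 */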

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    int pct_max = s->parameters.max_cpu_throttle;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
                         pct_max));
    }
}
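
/*
 * Example (illustrative numbers only): with cpu_throttle_initial = 20,
 * cpu_throttle_increment = 10 and max_cpu_throttle = 99, successive
 * calls throttle the guest at 20%, then 30%, 40%, ... up to 99%.
 */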

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
1543 1544
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1545
                             ram_counters.dirty_sync_count) == -1) {
1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
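
/*
 * An XBZRLE page therefore looks like this on the wire: the usual page
 * header with RAM_SAVE_FLAG_XBZRLE set, followed by
 *
 *     | u8: ENCODING_FLAG_XBZRLE | be16: encoded_len | encoded data |
 */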

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);
    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1656
                                              &rs->num_dirty_pages_period);
1657 1658
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }
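    /*
     * For instance, a guest mixing 4 KiB blocks with 2 MiB hugetlb-backed
     * blocks would return 0x1000 | 0x200000 = 0x201000.
     */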

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return  ram_counters.normal + ram_counters.duplicate +
                compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes of this period exceed 50% of
               the bytes transferred since the last time we were in this
               routine. If that happens twice, start or increase throttling */
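            /*
             * e.g. dirtying ~600 MiB of pages in a period in which ~1 GiB
             * was transferred trips the check below (600 MiB > 512 MiB);
             * the second time that happens mig_throttle_guest_down() runs.
             */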

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
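    /* i.e. a zero page costs only the page header plus one byte on the wire */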
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    multifd_queue_page(block, offset);
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
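        /*
         * A request can cover more than one target page (e.g. a whole huge
         * page asked for by postcopy); it is consumed one target page per
         * call, with the remainder left queued below.
         */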

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start+len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is on, stop using the data compression after first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    compression_counters.busy++;
    return false;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
    int res;
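    /*
     * The page is tried in order: the control (e.g. RDMA) path, the
     * multi-threaded compression path, zero-page detection, and finally
     * the multifd or plain ram_save_page() path below.
     */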

    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * do not use multifd for compression as the first page in the new
     * block should be posted out before sending the compressed page
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
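    /*
     * e.g. for a 2 MiB host (huge) page and 4 KiB target pages,
     * pagesize_bits is 512, so up to 512 dirty target pages are sent
     * before the loop below reaches the host page boundary.
     */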

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check if the page is dirty and if it is, send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }

        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    rcu_read_unlock();
    return total;
}

uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}

static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* the caller must hold the iothread lock or be in a bh, so there is
     * no writing race against this migration_bitmap
     */
    memory_global_dirty_log_stop();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}

static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
    rs->fpo_enabled = false;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *unsentmap = block->unsentmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
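        /*
         * e.g. with host_ratio = 512 (2 MiB host pages, 4 KiB target
         * pages), a run starting at target page 1000 has host_offset 488,
         * so it is pulled back to page 512 and that whole host page gets
         * fixed up below.
         */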
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}

/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}

/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    ram_state_reset(*rsp);

    return 0;
}

static void ram_list_init_bitmaps(void)
{
    RAMBlock *block;
    unsigned long pages;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }
}

static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}

static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}

/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}
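/*
 * Illustrative usage sketch, not part of this file: a device that learns
 * which guest pages are free (e.g. a balloon-style hinting device) would
 * report each contiguous host-virtual range like this.  The helper
 * hinting_dev_report_range() and its arguments are hypothetical; only
 * qemu_guest_free_page_hint() above is real.
 *
 *     static void hinting_dev_report_range(void *hva, size_t bytes)
 *     {
 *         // Clearing these pages from the dirty bitmap means the
 *         // migration thread will simply skip them, shrinking the
 *         // amount of RAM that has to be sent.
 *         qemu_guest_free_page_hint(hva, bytes);
 *     }
 */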

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
        if (migrate_ignore_shared()) {
            qemu_put_be64(f, block->mr->addr);
            qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
        }
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
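/*
 * Rough sketch of what the setup stage above puts on the wire, as implied by
 * the qemu_put_* calls (field widths only; a reading aid, not a normative
 * description of the stream format):
 *
 *     be64  total_ram_bytes | RAM_SAVE_FLAG_MEM_SIZE
 *     for each migratable RAMBlock:
 *         u8    strlen(idstr)
 *         bytes idstr (not NUL terminated)
 *         be64  used_length
 *         be64  page_size          (only if postcopy && page_size != host)
 *         be64  mr->addr, u8 flag  (only if ignore-shared is enabled)
 *     be64  RAM_SAVE_FLAG_EOS
 */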

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        if (pages < 0) {
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once
           every few iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    multifd_send_sync_main();
out:
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync_precopy(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
        if (pages < 0) {
            ret = pages;
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    multifd_send_sync_main();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return ret;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync_precopy(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}
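/*
 * Worked example for the threshold check above (numbers are made up): with
 * 4 KiB target pages and 10000 dirty pages, remaining_size starts at roughly
 * 40 MB.  If the caller's max_size budget is, say, 64 MB, we are close enough
 * to finishing that it is worth taking the iothread lock and re-syncing the
 * dirty bitmap, so the decision to move to the completion stage is based on a
 * fresh count rather than a stale one.
 */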

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
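/*
 * Shape of an XBZRLE record as read above (a reading aid, not a separate
 * spec): one byte of flags that must equal ENCODING_FLAG_XBZRLE, a be16
 * length capped at TARGET_PAGE_SIZE, then that many bytes of delta data.
 * xbzrle_decode_buffer() applies the delta on top of the stale copy of the
 * page already present at @host, which is why XBZRLE only works for pages
 * the destination has seen before.
 */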

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}
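/*
 * Illustrative consequence of the RAM_SAVE_FLAG_CONTINUE handling above:
 * the sender only emits the block idstr when it switches blocks, so a run
 * of pages from one RAMBlock looks roughly like
 *
 *     [addr|flags, "pc.ram", page data]
 *     [addr|flags|RAM_SAVE_FLAG_CONTINUE, page data]
 *     [addr|flags|RAM_SAVE_FLAG_CONTINUE, page data]
 *     ...
 *
 * ("pc.ram" is just an example block name) and the static `block` pointer
 * kept here is what makes the short form decodable.
 */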

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

static inline void *colo_cache_from_block_offset(RAMBlock *block,
                                                 ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need the bitmap of these migrated pages.
     * It helps us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     */
    if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}
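/*
 * Minimal usage sketch for the helper above (the buffer names and
 * COMPBUF_SIZE/compressed_len are hypothetical; the real caller is
 * do_data_decompress() below): the z_stream must have been set up once with
 * inflateInit(), after which it can be reused for every page because
 * qemu_uncompress_data() starts with inflateReset().
 *
 *     z_stream stream = {};
 *     uint8_t compbuf[COMPBUF_SIZE], page[TARGET_PAGE_SIZE];
 *
 *     if (inflateInit(&stream) == Z_OK) {
 *         int n = qemu_uncompress_data(&stream, page, sizeof(page),
 *                                      compbuf, compressed_len);
 *         // n is the decompressed size, or negative on error
 *         inflateEnd(&stream);
 *     }
 */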

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
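/*
 * Hand-off protocol between the dispatcher above and do_data_decompress(),
 * summarised: the dispatcher scans for a worker whose `done` flag is set
 * (all under decomp_done_lock), clears `done`, copies the compressed bytes
 * into that worker's compbuf, publishes des/len under param->mutex and
 * signals param->cond; the worker decompresses and then, under
 * decomp_done_lock, sets `done` again and signals decomp_done_cond so a
 * blocked dispatcher can retry.  `done` therefore only ever flips while
 * decomp_done_lock is held, which is what wait_for_decompress_done()
 * relies on.
 */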

/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM, and we need to hold the global lock
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                NULL,
                                                false);
        if (!block->colo_cache) {
            error_report("%s: Can't alloc memory for COLO cache of block %s,"
                         "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                         block->used_length);
            goto out_locked;
        }
        memcpy(block->colo_cache, block->host, block->used_length);
    }
    rcu_read_unlock();
    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;

            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
        }
    }
    ram_state = g_new0(RAMState, 1);
    ram_state->migration_dirty_pages = 0;
    memory_global_dirty_log_start();

    return 0;

out_locked:

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if (block->colo_cache) {
            qemu_anon_ram_free(block->colo_cache, block->used_length);
            block->colo_cache = NULL;
        }
    }

    rcu_read_unlock();
    return -errno;
}

/* We need to hold the global lock to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    rcu_read_lock();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if (block->colo_cache) {
            qemu_anon_ram_free(block->colo_cache, block->used_length);
            block->colo_cache = NULL;
        }
    }

    rcu_read_unlock();
    g_free(ram_state);
    ram_state = NULL;
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            pmem_persist(rb->host, rb->used_length);
        }
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages;
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
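/*
 * Worked example for the host-page assembly above, assuming a 2 MiB
 * hugepage RAMBlock and 4 KiB target pages: the 512 target pages of one
 * host page arrive in order, each is copied into postcopy_host_page at
 * offset (host & (block->page_size - 1)), and only when the last one
 * (offset 0x1ff000) has been read does place_needed become true and the
 * whole 2 MiB page get placed atomically with postcopy_place_page().
 * all_zero short-circuits that copy when every target page was zero.
 */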

static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that were dirtied by the PVM or SVM, or both.
 */
static void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
    }
    rcu_read_unlock();

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    rcu_read_lock();
    block = QLIST_FIRST_RCU(&ram_list.blocks);

    while (block) {
        offset = migration_bitmap_find_dirty(ram_state, block, offset);

        if (offset << TARGET_PAGE_BITS >= block->used_length) {
            offset = 0;
            block = QLIST_NEXT_RCU(block, next);
        } else {
            migration_bitmap_clear_dirty(ram_state, block, offset);
            dst_host = block->host + (offset << TARGET_PAGE_BITS);
            src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
            memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
        }
    }

    rcu_read_unlock();
    trace_colo_flush_ram_cache_end();
}
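/*
 * Conceptual sketch of the walk above, using the generic bitops API rather
 * than the migration helpers (`pages` would be used_length >> TARGET_PAGE_BITS):
 *
 *     unsigned long bit = find_next_bit(block->bmap, pages, 0);
 *     while (bit < pages) {
 *         clear_bit(bit, block->bmap);
 *         memcpy(block->host + (bit << TARGET_PAGE_BITS),
 *                block->colo_cache + (bit << TARGET_PAGE_BITS),
 *                TARGET_PAGE_SIZE);
 *         bit = find_next_bit(block->bmap, pages, bit + 1);
 *     }
 *
 * The real code goes through migration_bitmap_find_dirty()/_clear_dirty()
 * so that migration_dirty_pages stays consistent.
 */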

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0, invalid_flags = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }
    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            /*
             * After going into COLO, we should load the Page into colo_cache.
             */
            if (migration_incoming_in_colo_state()) {
                host = colo_cache_from_block_offset(block, addr);
            } else {
                host = host_from_ram_block_offset(block, addr);
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        bool ignored = qemu_get_byte(f);
                        if (ignored != ramblock_is_ignored(block)) {
                            error_report("RAM block %s should %s be migrated",
                                         id, ignored ? "" : "not");
                            ret = -EINVAL;
                        }
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    ret |= wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);

    if (!ret && migration_incoming_in_colo_state()) {
        colo_flush_ram_cache();
    }
    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM.  */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmap synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this is
     * the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
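/*
 * Small concrete example of the two bitmap steps above: if the destination
 * reports that it has received pages 0 and 2 of a 4-page block, the
 * little-endian stream carries 0b0101.  After bitmap_from_le() the local
 * bmap holds {0, 2}, and bitmap_complement() turns it into {1, 3}, i.e.
 * exactly the pages that still have to be (re)sent when the postcopy
 * migration resumes.
 */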

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}