postcopy-ram.c 24.6 KB
Newer Older
D
Dr. David Alan Gilbert 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
D
Dr. David Alan Gilbert 已提交
20 21

#include "qemu-common.h"
22
#include "exec/target_page.h"
D
Dr. David Alan Gilbert 已提交
23
#include "migration/migration.h"
J
Juan Quintela 已提交
24
#include "qemu-file.h"
25
#include "savevm.h"
26
#include "postcopy-ram.h"
27
#include "ram.h"
D
Dr. David Alan Gilbert 已提交
28
#include "sysemu/sysemu.h"
29
#include "sysemu/balloon.h"
D
Dr. David Alan Gilbert 已提交
30 31 32
#include "qemu/error-report.h"
#include "trace.h"

33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

D
Dr. David Alan Gilbert 已提交
50 51 52 53 54 55
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and efficiently map new pages in, the techniques for doing this
 * are target OS specific.
 */
#if defined(__linux__)

56
#include <poll.h>
D
Dr. David Alan Gilbert 已提交
57 58 59 60 61
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

62 63
#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
D
Dr. David Alan Gilbert 已提交
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
#include <linux/userfaultfd.h>

static bool ufd_version_check(int ufd)
{
    struct uffdio_api api_struct;
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

87 88 89 90 91 92 93 94 95 96 97
    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = api_struct.features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
D
Dr. David Alan Gilbert 已提交
98 99 100
    return true;
}

101 102
/* Callback from postcopy_ram_supported_by_host block iterator.
 */
103
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
104 105
                             ram_addr_t offset, ram_addr_t length, void *opaque)
{
106 107 108 109
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (qemu_ram_is_shared(rb)) {
110 111 112 113
        error_report("Postcopy on shared RAM (%s) is not yet supported",
                     block_name);
        return 1;
    }
114 115 116 117 118 119 120

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
121 122 123
    return 0;
}

124 125 126 127 128
/*
 * Note: This has the side effect of munlock'ing all of RAM, that's
 * normally fine since if the postcopy succeeds it gets turned back on at the
 * end.
 */
D
Dr. David Alan Gilbert 已提交
129 130 131 132 133 134 135 136 137 138
bool postcopy_ram_supported_by_host(void)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;

139
    if (qemu_target_page_size() > pagesize) {
D
Dr. David Alan Gilbert 已提交
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Version and features check */
    if (!ufd_version_check(ufd)) {
        goto out;
    }

156
    /* We don't support postcopy with shared RAM yet */
157
    if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
158 159 160
        goto out;
    }

161 162 163 164 165 166 167 168 169
    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__,  strerror(errno));
        return -1;
    }

D
Dr. David Alan Gilbert 已提交
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
    /*
     *  We need to check that the ops we need are supported on anon memory
     *  To do that we need to register a chunk and see the flags that
     *  are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize-1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
237
    if (ram_discard_range(block_name, 0, length)) {
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                        ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * we can turn it back on now.
     */
259
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284

    /*
     * We can also turn off userfault now since we should have all the
     * pages.   It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
285
    if (qemu_ram_foreach_block(init_range, NULL)) {
286 287 288 289 290 291 292 293 294 295 296
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        uint64_t tmp64;

        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }
        /*
         * Tell the fault_thread to exit, it's an eventfd that should
         * currently be at 0, we're going to increment it to 1
         */
        tmp64 = 1;
        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
            trace_postcopy_ram_incoming_cleanup_join();
            qemu_thread_join(&mis->fault_thread);
        } else {
            /* Not much we can do here, but may as well report it */
            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
                         strerror(errno));
        }
        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_quit_fd);
        mis->have_fault_thread = false;
322 323
    }

324 325
    qemu_balloon_inhibit(false);

326 327 328 329 330 331 332 333 334 335
    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

336 337
    postcopy_state_set(POSTCOPY_INCOMING_END);

338
    if (mis->postcopy_tmp_page) {
339
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
340 341
        mis->postcopy_tmp_page = NULL;
    }
342 343 344 345
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
346
    trace_postcopy_ram_incoming_cleanup_exit();
347 348 349
    return 0;
}

350 351 352 353 354 355 356 357 358 359 360 361 362
/*
 * Disable huge pages on an area
 */
static int nhp_range(const char *block_name, void *host_addr,
                    ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
363
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
 * however leaving it until after precopy means that most of the precopy
 * data is still THPd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a  callback on qemu_ram_foreach_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
409 410 411 412
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
413 414 415 416 417 418 419 420 421 422

    return 0;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
423 424 425 426
    struct uffd_msg msg;
    int ret;
    RAMBlock *rb = NULL;
    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
427

428
    trace_postcopy_ram_fault_thread_entry();
429 430
    qemu_sem_post(&mis->fault_thread_sem);

431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
    while (true) {
        ram_addr_t rb_offset;
        struct pollfd pfd[2];

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_quit_fd which is
         * an eventfd
         */
        pfd[0].fd = mis->userfault_fd;
        pfd[0].events = POLLIN;
        pfd[0].revents = 0;
        pfd[1].fd = mis->userfault_quit_fd;
        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
        pfd[1].revents = 0;

        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            trace_postcopy_ram_fault_thread_quit();
            break;
        }

        ret = read(mis->userfault_fd, &msg, sizeof(msg));
        if (ret != sizeof(msg)) {
            if (errno == EAGAIN) {
                /*
                 * if a wake up happens on the other thread just after
                 * the poll, there is nothing to read.
                 */
                continue;
            }
            if (ret < 0) {
                error_report("%s: Failed to read full userfault message: %s",
                             __func__, strerror(errno));
                break;
            } else {
                error_report("%s: Read %d bytes from userfaultfd expected %zd",
                             __func__, ret, sizeof(msg));
                break; /* Lost alignment, don't know what we'd read next */
            }
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            error_report("%s: Read unexpected event %ud from userfaultfd",
                         __func__, msg.event);
            continue; /* It's not a page fault, shouldn't happen */
        }

        rb = qemu_ram_block_from_host(
                 (void *)(uintptr_t)msg.arg.pagefault.address,
484
                 true, &rb_offset);
485 486 487 488 489 490
        if (!rb) {
            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                         PRIx64, (uint64_t)msg.arg.pagefault.address);
            break;
        }

491
        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
492 493 494 495 496 497 498 499 500 501 502
        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset);

        /*
         * Send the request to the source - we want to request one
         * of our host page sizes (which is >= TPS)
         */
        if (rb != last_rb) {
            last_rb = rb;
            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
503
                                     rb_offset, qemu_ram_pagesize(rb));
504 505 506
        } else {
            /* Save some space */
            migrate_send_rp_req_pages(mis, NULL,
507
                                     rb_offset, qemu_ram_pagesize(rb));
508 509 510
        }
    }
    trace_postcopy_ram_fault_thread_exit();
511 512 513 514 515
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_version_check(mis->userfault_fd)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_quit_fd == -1) {
        error_report("%s: Opening userfault_quit_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

541 542 543 544 545
    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
546
    mis->have_fault_thread = true;
547 548 549 550 551 552

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
        return -1;
    }

553 554 555 556 557 558
    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

559 560
    trace_postcopy_ram_enable_notify();

561 562 563
    return 0;
}

564 565 566 567
/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
568 569
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        size_t pagesize)
570 571 572 573 574
{
    struct uffdio_copy copy_struct;

    copy_struct.dst = (uint64_t)(uintptr_t)host;
    copy_struct.src = (uint64_t)(uintptr_t)from;
575
    copy_struct.len = pagesize;
576 577 578 579 580 581 582 583 584
    copy_struct.mode = 0;

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
        int e = errno;
585 586
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);
587 588 589 590 591 592 593 594 595 596 597 598

        return -e;
    }

    trace_postcopy_place_page(host);
    return 0;
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
599 600
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             size_t pagesize)
601
{
602
    trace_postcopy_place_page_zero(host);
603

604 605 606 607 608
    if (pagesize == getpagesize()) {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host;
        zero_struct.range.len = getpagesize();
        zero_struct.mode = 0;
609

610 611 612 613
        if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);
614

615 616 617
            return -e;
        }
    } else {
618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   pagesize);
635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
    }

    return 0;
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page
 * The same address is used repeatedly, postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 *
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
651
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
652 653
                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
                             MAP_ANONYMOUS, -1, 0);
654 655
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
656 657 658 659 660 661 662 663
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

D
Dr. David Alan Gilbert 已提交
664 665 666 667 668 669 670 671
#else
/* No target OS support, stubs just fail */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}

672 673 674 675 676 677 678 679 680 681 682 683
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

684 685 686 687 688 689
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

690 691 692 693 694
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
695

696 697
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        size_t pagesize)
698 699 700 701 702
{
    assert(0);
    return -1;
}

703 704
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                        size_t pagesize)
705 706 707 708 709 710 711 712 713 714 715
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

D
Dr. David Alan Gilbert 已提交
716 717
#endif

718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
/* ------------------------------------------------------------------------- */

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @offset: the bitmap offset of the named RAMBlock in the migration
 *   bitmap.
 * @name: RAMBlock that discards will operate on.
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    if (res) {
        res->ramblock_name = name;
    }

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                unsigned long start, unsigned long length)
{
756
    size_t tp_size = qemu_target_page_size();
757
    /* Convert to byte offsets within the RAM block */
758
    pds->start_list[pds->cur_entry] = start  * tp_size;
759
    pds->length_list[pds->cur_entry] = length * tp_size;
760 761 762 763 764 765
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
766 767
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code. Sends any outstanding discard messages, frees the PDS
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
787 788
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
789 790 791 792 793 794 795 796 797 798 799
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since it's state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState  postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}