postcopy-ram.c 24.6 KB
Newer Older
D
Dr. David Alan Gilbert 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
D
Dr. David Alan Gilbert 已提交
20 21 22

#include "qemu-common.h"
#include "migration/migration.h"
23
#include "migration/qemu-file.h"
24
#include "postcopy-ram.h"
D
Dr. David Alan Gilbert 已提交
25
#include "sysemu/sysemu.h"
26
#include "sysemu/balloon.h"
D
Dr. David Alan Gilbert 已提交
27 28 29
#include "qemu/error-report.h"
#include "trace.h"

30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

D
Dr. David Alan Gilbert 已提交
47 48 49 50 51 52
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and efficiently map new pages in, the techniques for doing this
 * are target OS specific.
 */
#if defined(__linux__)

53
#include <poll.h>
D
Dr. David Alan Gilbert 已提交
54 55 56 57 58
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

59 60
#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
D
Dr. David Alan Gilbert 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
#include <linux/userfaultfd.h>

static bool ufd_version_check(int ufd)
{
    struct uffdio_api api_struct;
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

84 85 86 87 88 89 90 91 92 93 94
    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = api_struct.features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
D
Dr. David Alan Gilbert 已提交
95 96 97
    return true;
}

98 99
/* Callback from postcopy_ram_supported_by_host block iterator.
 */
100
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
101 102
                             ram_addr_t offset, ram_addr_t length, void *opaque)
{
103 104 105 106
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (qemu_ram_is_shared(rb)) {
107 108 109 110
        error_report("Postcopy on shared RAM (%s) is not yet supported",
                     block_name);
        return 1;
    }
111 112 113 114 115 116 117

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
118 119 120
    return 0;
}

121 122 123 124 125
/*
 * Note: This has the side effect of munlock'ing all of RAM, that's
 * normally fine since if the postcopy succeeds it gets turned back on at the
 * end.
 */
D
Dr. David Alan Gilbert 已提交
126 127 128 129 130 131 132 133 134 135
bool postcopy_ram_supported_by_host(void)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;

136
    if (qemu_target_page_size() > pagesize) {
D
Dr. David Alan Gilbert 已提交
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Version and features check */
    if (!ufd_version_check(ufd)) {
        goto out;
    }

153
    /* We don't support postcopy with shared RAM yet */
154
    if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
155 156 157
        goto out;
    }

158 159 160 161 162 163 164 165 166
    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__,  strerror(errno));
        return -1;
    }

D
Dr. David Alan Gilbert 已提交
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
    /*
     *  We need to check that the ops we need are supported on anon memory
     *  To do that we need to register a chunk and see the flags that
     *  are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize-1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
234
    if (ram_discard_range(block_name, 0, length)) {
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                        ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * we can turn it back on now.
     */
256
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281

    /*
     * We can also turn off userfault now since we should have all the
     * pages.   It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
282
    if (qemu_ram_foreach_block(init_range, NULL)) {
283 284 285 286 287 288 289 290 291 292 293
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        uint64_t tmp64;

        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }
        /*
         * Tell the fault_thread to exit, it's an eventfd that should
         * currently be at 0, we're going to increment it to 1
         */
        tmp64 = 1;
        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
            trace_postcopy_ram_incoming_cleanup_join();
            qemu_thread_join(&mis->fault_thread);
        } else {
            /* Not much we can do here, but may as well report it */
            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
                         strerror(errno));
        }
        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_quit_fd);
        mis->have_fault_thread = false;
319 320
    }

321 322
    qemu_balloon_inhibit(false);

323 324 325 326 327 328 329 330 331 332
    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

333 334 335
    postcopy_state_set(POSTCOPY_INCOMING_END);
    migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);

336
    if (mis->postcopy_tmp_page) {
337
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
338 339
        mis->postcopy_tmp_page = NULL;
    }
340 341 342 343
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
344
    trace_postcopy_ram_incoming_cleanup_exit();
345 346 347
    return 0;
}

348 349 350 351 352 353 354 355 356 357 358 359 360
/*
 * Disable huge pages on an area
 */
static int nhp_range(const char *block_name, void *host_addr,
                    ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
361
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
 * however leaving it until after precopy means that most of the precopy
 * data is still THPd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a  callback on qemu_ram_foreach_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
407 408 409 410
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
411 412 413 414 415 416 417 418 419 420

    return 0;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
421 422 423 424
    struct uffd_msg msg;
    int ret;
    RAMBlock *rb = NULL;
    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
425

426
    trace_postcopy_ram_fault_thread_entry();
427 428
    qemu_sem_post(&mis->fault_thread_sem);

429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
    while (true) {
        ram_addr_t rb_offset;
        struct pollfd pfd[2];

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_quit_fd which is
         * an eventfd
         */
        pfd[0].fd = mis->userfault_fd;
        pfd[0].events = POLLIN;
        pfd[0].revents = 0;
        pfd[1].fd = mis->userfault_quit_fd;
        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
        pfd[1].revents = 0;

        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            trace_postcopy_ram_fault_thread_quit();
            break;
        }

        ret = read(mis->userfault_fd, &msg, sizeof(msg));
        if (ret != sizeof(msg)) {
            if (errno == EAGAIN) {
                /*
                 * if a wake up happens on the other thread just after
                 * the poll, there is nothing to read.
                 */
                continue;
            }
            if (ret < 0) {
                error_report("%s: Failed to read full userfault message: %s",
                             __func__, strerror(errno));
                break;
            } else {
                error_report("%s: Read %d bytes from userfaultfd expected %zd",
                             __func__, ret, sizeof(msg));
                break; /* Lost alignment, don't know what we'd read next */
            }
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            error_report("%s: Read unexpected event %ud from userfaultfd",
                         __func__, msg.event);
            continue; /* It's not a page fault, shouldn't happen */
        }

        rb = qemu_ram_block_from_host(
                 (void *)(uintptr_t)msg.arg.pagefault.address,
482
                 true, &rb_offset);
483 484 485 486 487 488
        if (!rb) {
            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                         PRIx64, (uint64_t)msg.arg.pagefault.address);
            break;
        }

489
        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
490 491 492 493 494 495 496 497 498 499 500
        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset);

        /*
         * Send the request to the source - we want to request one
         * of our host page sizes (which is >= TPS)
         */
        if (rb != last_rb) {
            last_rb = rb;
            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
501
                                     rb_offset, qemu_ram_pagesize(rb));
502 503 504
        } else {
            /* Save some space */
            migrate_send_rp_req_pages(mis, NULL,
505
                                     rb_offset, qemu_ram_pagesize(rb));
506 507 508
        }
    }
    trace_postcopy_ram_fault_thread_exit();
509 510 511 512 513
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_version_check(mis->userfault_fd)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_quit_fd == -1) {
        error_report("%s: Opening userfault_quit_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

539 540 541 542 543
    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
544
    mis->have_fault_thread = true;
545 546 547 548 549 550

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
        return -1;
    }

551 552 553 554 555 556
    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

557 558
    trace_postcopy_ram_enable_notify();

559 560 561
    return 0;
}

562 563 564 565
/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
566 567
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        size_t pagesize)
568 569 570 571 572
{
    struct uffdio_copy copy_struct;

    copy_struct.dst = (uint64_t)(uintptr_t)host;
    copy_struct.src = (uint64_t)(uintptr_t)from;
573
    copy_struct.len = pagesize;
574 575 576 577 578 579 580 581 582
    copy_struct.mode = 0;

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
        int e = errno;
583 584
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);
585 586 587 588 589 590 591 592 593 594 595 596

        return -e;
    }

    trace_postcopy_place_page(host);
    return 0;
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
597 598
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             size_t pagesize)
599
{
600
    trace_postcopy_place_page_zero(host);
601

602 603 604 605 606
    if (pagesize == getpagesize()) {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host;
        zero_struct.range.len = getpagesize();
        zero_struct.mode = 0;
607

608 609 610 611
        if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);
612

613 614 615
            return -e;
        }
    } else {
616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   pagesize);
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    }

    return 0;
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page
 * The same address is used repeatedly, postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 *
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
649
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
650 651
                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
                             MAP_ANONYMOUS, -1, 0);
652 653
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
654 655 656 657 658 659 660 661
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

D
Dr. David Alan Gilbert 已提交
662 663 664 665 666 667 668 669
#else
/* No target OS support, stubs just fail */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}

670 671 672 673 674 675 676 677 678 679 680 681
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

682 683 684 685 686 687
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

688 689 690 691 692
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
693

694 695
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        size_t pagesize)
696 697 698 699 700
{
    assert(0);
    return -1;
}

701 702
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                        size_t pagesize)
703 704 705 706 707 708 709 710 711 712 713
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

D
Dr. David Alan Gilbert 已提交
714 715
#endif

716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
/* ------------------------------------------------------------------------- */

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @offset: the bitmap offset of the named RAMBlock in the migration
 *   bitmap.
 * @name: RAMBlock that discards will operate on.
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    if (res) {
        res->ramblock_name = name;
    }

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                unsigned long start, unsigned long length)
{
754
    size_t tp_size = qemu_target_page_size();
755
    /* Convert to byte offsets within the RAM block */
756
    pds->start_list[pds->cur_entry] = start  * tp_size;
757
    pds->length_list[pds->cur_entry] = length * tp_size;
758 759 760 761 762 763
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
764 765
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code. Sends any outstanding discard messages, frees the PDS
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
785 786
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
787 788 789 790 791 792 793 794 795 796 797
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since it's state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState  postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}