kvm-all.c 64.0 KB
Newer Older
A
aliguori 已提交
1 2 3 4
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
A
aliguori 已提交
6 7 8
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
A
aliguori 已提交
10 11 12 13 14 15
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
16
#include "qemu/osdep.h"
A
aliguori 已提交
17 18 19 20 21 22
#include <sys/ioctl.h>
#include <sys/mman.h>

#include <linux/kvm.h>

#include "qemu-common.h"
23 24 25
#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
26
#include "qemu/error-report.h"
J
Jan Kiszka 已提交
27
#include "hw/hw.h"
28
#include "hw/pci/msi.h"
29
#include "hw/s390x/adapter.h"
30
#include "exec/gdbstub.h"
31
#include "sysemu/kvm_int.h"
32
#include "qemu/bswap.h"
33
#include "exec/memory.h"
34
#include "exec/ram_addr.h"
35
#include "exec/address-spaces.h"
36
#include "qemu/event_notifier.h"
37
#include "trace.h"
38
#include "hw/irq.h"
A
aliguori 已提交
39

40 41
#include "hw/boards.h"

42 43 44 45 46
/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

47 48 49 50
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#define PAGE_SIZE getpagesize()
A
aliguori 已提交
51

A
aliguori 已提交
52 53 54
//#define DEBUG_KVM

#ifdef DEBUG_KVM
55
#define DPRINTF(fmt, ...) \
A
aliguori 已提交
56 57
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
58
#define DPRINTF(fmt, ...) \
A
aliguori 已提交
59 60 61
    do { } while (0)
#endif

62 63
#define KVM_MSI_HASHTAB_SIZE    256

G
Gu Zheng 已提交
64 65 66 67 68 69
struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

P
Paolo Bonzini 已提交
70
struct KVMState
A
aliguori 已提交
71
{
72 73
    AccelState parent_obj;

74
    int nr_slots;
A
aliguori 已提交
75 76
    int fd;
    int vmfd;
A
aliguori 已提交
77
    int coalesced_mmio;
78
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
79
    bool coalesced_flush_in_progress;
80
    int broken_set_mem_region;
81
    int vcpu_events;
82
    int robust_singlestep;
83
    int debugregs;
84 85 86
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
87
    int many_ioeventfds;
88
    int intx_set_mask;
89 90 91
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
92
    unsigned irq_set_ioctl;
93
    unsigned int sigmask_len;
94
    GHashTable *gsimap;
95 96 97
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
98
    unsigned long *used_gsi_bitmap;
99
    unsigned int gsi_count;
100
    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
101
#endif
102
    KVMMemoryListener memory_listener;
G
Gu Zheng 已提交
103
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
P
Paolo Bonzini 已提交
104
};
A
aliguori 已提交
105

106
KVMState *kvm_state;
107
bool kvm_kernel_irqchip;
108
bool kvm_split_irqchip;
109
bool kvm_async_interrupts_allowed;
110
bool kvm_halt_in_kernel_allowed;
111
bool kvm_eventfds_allowed;
112
bool kvm_irqfds_allowed;
113
bool kvm_resamplefds_allowed;
114
bool kvm_msi_via_irqfd_allowed;
115
bool kvm_gsi_routing_allowed;
116
bool kvm_gsi_direct_mapping;
117
bool kvm_allowed;
118
bool kvm_readonly_mem_allowed;
119
bool kvm_vm_attributes_allowed;
120
bool kvm_direct_msi_allowed;
121
bool kvm_ioeventfd_any_length_allowed;
A
aliguori 已提交
122

123 124 125 126 127 128
static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};

129
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
A
aliguori 已提交
130
{
131
    KVMState *s = kvm_state;
A
aliguori 已提交
132 133
    int i;

134
    for (i = 0; i < s->nr_slots; i++) {
135 136
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
J
Jan Kiszka 已提交
137
        }
A
aliguori 已提交
138 139
    }

140 141 142 143 144
    return NULL;
}

bool kvm_has_free_slot(MachineState *ms)
{
145 146 147
    KVMState *s = KVM_STATE(ms->accelerator);

    return kvm_get_free_slot(&s->memory_listener);
148 149
}

150
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
151
{
152
    KVMSlot *slot = kvm_get_free_slot(kml);
153 154 155 156 157

    if (slot) {
        return slot;
    }

158 159 160 161
    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

162
static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
A
Avi Kivity 已提交
163 164
                                         hwaddr start_addr,
                                         hwaddr end_addr)
165
{
166
    KVMState *s = kvm_state;
167 168
    int i;

169
    for (i = 0; i < s->nr_slots; i++) {
170
        KVMSlot *mem = &kml->slots[i];
171 172 173 174 175 176 177

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

A
aliguori 已提交
178 179 180
    return NULL;
}

181 182 183
/*
 * Find overlapping slot with lowest start address
 */
184
static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
A
Avi Kivity 已提交
185 186
                                            hwaddr start_addr,
                                            hwaddr end_addr)
A
aliguori 已提交
187
{
188
    KVMState *s = kvm_state;
189
    KVMSlot *found = NULL;
A
aliguori 已提交
190 191
    int i;

192
    for (i = 0; i < s->nr_slots; i++) {
193
        KVMSlot *mem = &kml->slots[i];
A
aliguori 已提交
194

195 196 197 198 199 200 201 202 203
        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
A
aliguori 已提交
204 205
    }

206
    return found;
A
aliguori 已提交
207 208
}

209
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
A
Avi Kivity 已提交
210
                                       hwaddr *phys_addr)
211
{
212
    KVMMemoryListener *kml = &s->memory_listener;
213 214
    int i;

215
    for (i = 0; i < s->nr_slots; i++) {
216
        KVMSlot *mem = &kml->slots[i];
217

218 219
        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
220 221 222 223 224 225 226
            return 1;
        }
    }

    return 0;
}

227
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
228
{
229
    KVMState *s = kvm_state;
230 231
    struct kvm_userspace_memory_region mem;

232
    mem.slot = slot->slot | (kml->as_id << 16);
233
    mem.guest_phys_addr = slot->start_addr;
234
    mem.userspace_addr = (unsigned long)slot->ram;
235
    mem.flags = slot->flags;
236 237

    if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
238 239 240 241 242 243
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    mem.memory_size = slot->memory_size;
244 245 246
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

G
Gu Zheng 已提交
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
int kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    struct KVMParkedVcpu *vcpu = NULL;
    int ret = 0;

    DPRINTF("kvm_destroy_vcpu\n");

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
    return ret;
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            int kvm_fd;

            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            return kvm_fd;
        }
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

294
int kvm_init_vcpu(CPUState *cpu)
A
aliguori 已提交
295 296 297 298 299
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

300
    DPRINTF("kvm_init_vcpu\n");
A
aliguori 已提交
301

G
Gu Zheng 已提交
302
    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
A
aliguori 已提交
303
    if (ret < 0) {
304
        DPRINTF("kvm_create_vcpu failed\n");
A
aliguori 已提交
305 306 307
        goto err;
    }

A
Andreas Färber 已提交
308
    cpu->kvm_fd = ret;
309
    cpu->kvm_state = s;
A
Andreas Färber 已提交
310
    cpu->kvm_vcpu_dirty = true;
A
aliguori 已提交
311 312 313

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
314
        ret = mmap_size;
315
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
A
aliguori 已提交
316 317 318
        goto err;
    }

A
Andreas Färber 已提交
319
    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
A
Andreas Färber 已提交
320
                        cpu->kvm_fd, 0);
A
Andreas Färber 已提交
321
    if (cpu->kvm_run == MAP_FAILED) {
A
aliguori 已提交
322
        ret = -errno;
323
        DPRINTF("mmap'ing vcpu state failed\n");
A
aliguori 已提交
324 325 326
        goto err;
    }

J
Jan Kiszka 已提交
327 328
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
A
Andreas Färber 已提交
329
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
J
Jan Kiszka 已提交
330
    }
331

A
Andreas Färber 已提交
332
    ret = kvm_arch_init_vcpu(cpu);
A
aliguori 已提交
333 334 335 336
err:
    return ret;
}

337 338 339
/*
 * dirty pages logging control
 */
340

341
static int kvm_mem_flags(MemoryRegion *mr)
342
{
343
    bool readonly = mr->readonly || memory_region_is_romd(mr);
344
    int flags = 0;
345 346 347 348

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
349 350 351 352
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
353 354
}

355 356
static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
357
{
358 359 360
    int old_flags;

    old_flags = mem->flags;
361
    mem->flags = kvm_mem_flags(mr);
362

363
    /* If nothing changed effectively, no need to issue ioctl */
364
    if (mem->flags == old_flags) {
365
        return 0;
366 367
    }

368
    return kvm_set_user_memory_region(kml, mem);
369 370
}

371 372
static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
373
{
374 375
    hwaddr phys_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
376
    KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, phys_addr + size);
377 378

    if (mem == NULL)  {
379 380
        return 0;
    } else {
381
        return kvm_slot_update_flags(kml, mem, section->mr);
382 383 384
    }
}

A
Avi Kivity 已提交
385
static void kvm_log_start(MemoryListener *listener,
386 387
                          MemoryRegionSection *section,
                          int old, int new)
388
{
389
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
A
Avi Kivity 已提交
390 391
    int r;

392 393 394 395
    if (old != 0) {
        return;
    }

396
    r = kvm_section_update_flags(kml, section);
A
Avi Kivity 已提交
397 398 399
    if (r < 0) {
        abort();
    }
400 401
}

A
Avi Kivity 已提交
402
static void kvm_log_stop(MemoryListener *listener,
403 404
                          MemoryRegionSection *section,
                          int old, int new)
405
{
406
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
A
Avi Kivity 已提交
407 408
    int r;

409 410 411 412
    if (new != 0) {
        return;
    }

413
    r = kvm_section_update_flags(kml, section);
A
Avi Kivity 已提交
414 415 416
    if (r < 0) {
        abort();
    }
417 418
}

419
/* get kvm's dirty pages bitmap and update qemu's */
420 421
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
A
Alexander Graf 已提交
422
{
F
Fam Zheng 已提交
423 424
    ram_addr_t start = section->offset_within_region +
                       memory_region_get_ram_addr(section->mr);
425 426 427
    ram_addr_t pages = int128_get64(section->size) / getpagesize();

    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
428
    return 0;
A
Alexander Graf 已提交
429 430
}

431 432
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

433 434
/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
435 436 437
 * This function updates qemu's dirty bitmap using
 * memory_region_set_dirty().  This means all bits are set
 * to dirty.
438
 *
439
 * @start_add: start of logged region.
440 441
 * @end_addr: end of logged region.
 */
442 443
static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                          MemoryRegionSection *section)
444 445
{
    KVMState *s = kvm_state;
446
    unsigned long size, allocated_size = 0;
P
Paolo Bonzini 已提交
447
    struct kvm_dirty_log d = {};
448 449
    KVMSlot *mem;
    int ret = 0;
A
Avi Kivity 已提交
450
    hwaddr start_addr = section->offset_within_address_space;
451
    hwaddr end_addr = start_addr + int128_get64(section->size);
452

453 454
    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
455
        mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr);
456 457 458
        if (mem == NULL) {
            break;
        }
459

460 461 462 463 464 465 466 467 468 469
        /* XXX bad kernel interface alert
         * For dirty bitmap, kernel allocates array of size aligned to
         * bits-per-long.  But for case when the kernel is 64bits and
         * the userspace is 32bits, userspace can't align to the same
         * bits-per-long, since sizeof(long) is different between kernel
         * and user space.  This way, userspace will provide buffer which
         * may be 4 bytes less than the kernel will use, resulting in
         * userspace memory corruption (which is not detectable by valgrind
         * too, in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
S
Stefan Weil 已提交
470
         * a hope that sizeof(long) won't become >8 any time soon.
471 472 473
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
474
        if (!d.dirty_bitmap) {
475
            d.dirty_bitmap = g_malloc(size);
476
        } else if (size > allocated_size) {
477
            d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
478 479 480
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);
481

482
        d.slot = mem->slot | (kml->as_id << 16);
483
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
484
            DPRINTF("ioctl failed %d\n", errno);
485 486 487
            ret = -1;
            break;
        }
488

489
        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
490
        start_addr = mem->start_addr + mem->memory_size;
491
    }
492
    g_free(d.dirty_bitmap);
493 494

    return ret;
495 496
}

497 498
static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *secion,
A
Avi Kivity 已提交
499
                                     hwaddr start, hwaddr size)
A
aliguori 已提交
500 501 502 503 504 505 506 507
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
508
        zone.pad = 0;
A
aliguori 已提交
509

510
        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
A
aliguori 已提交
511 512 513
    }
}

514 515
static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *secion,
A
Avi Kivity 已提交
516
                                       hwaddr start, hwaddr size)
A
aliguori 已提交
517 518 519 520 521 522 523 524
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
525
        zone.pad = 0;
A
aliguori 已提交
526

527
        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
A
aliguori 已提交
528 529 530
    }
}

531 532 533 534 535 536 537 538 539 540 541 542
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

543 544 545 546 547 548 549 550 551 552 553 554 555
int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}

576
static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
577
                                  bool assign, uint32_t size, bool datamatch)
M
Michael S. Tsirkin 已提交
578 579
{
    int ret;
T
Thomas Huth 已提交
580 581 582 583 584 585 586
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };
M
Michael S. Tsirkin 已提交
587 588 589 590 591

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

592 593 594
    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
M
Michael S. Tsirkin 已提交
595 596 597 598 599 600 601 602 603 604 605 606 607
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

608
static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
609
                                 bool assign, uint32_t size, bool datamatch)
M
Michael S. Tsirkin 已提交
610 611
{
    struct kvm_ioeventfd kick = {
612
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
M
Michael S. Tsirkin 已提交
613
        .addr = addr,
614
        .flags = KVM_IOEVENTFD_FLAG_PIO,
615
        .len = size,
M
Michael S. Tsirkin 已提交
616 617 618 619 620 621
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
622 623 624
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
M
Michael S. Tsirkin 已提交
625 626 627 628 629 630 631 632 633 634 635
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}


636 637
static int kvm_check_many_ioeventfds(void)
{
638 639 640 641 642
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
643 644
     * can avoid creating too many ioeventfds.
     */
645
#if defined(CONFIG_EVENTFD)
646 647 648 649 650 651 652
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
653
        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
654 655 656 657 658 659 660 661 662 663
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
664
        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
665 666 667 668 669 670 671 672
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

673 674 675 676 677 678 679 680 681 682 683 684
static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

685 686
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
687 688 689 690
{
    KVMState *s = kvm_state;
    KVMSlot *mem, old;
    int err;
A
Avi Kivity 已提交
691
    MemoryRegion *mr = section->mr;
692
    bool writeable = !mr->readonly && !mr->rom_device;
A
Avi Kivity 已提交
693
    hwaddr start_addr = section->offset_within_address_space;
694
    ram_addr_t size = int128_get64(section->size);
695
    void *ram = NULL;
A
Avi Kivity 已提交
696
    unsigned delta;
697

698
    /* kvm works in page size chunks, but the function may be called
699 700
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
701 702
    delta = qemu_real_host_page_size - (start_addr & ~qemu_real_host_page_mask);
    delta &= ~qemu_real_host_page_mask;
A
Avi Kivity 已提交
703 704 705 706 707
    if (delta > size) {
        return;
    }
    start_addr += delta;
    size -= delta;
708 709
    size &= qemu_real_host_page_mask;
    if (!size || (start_addr & ~qemu_real_host_page_mask)) {
A
Avi Kivity 已提交
710 711
        return;
    }
712

A
Avi Kivity 已提交
713
    if (!memory_region_is_ram(mr)) {
714 715 716 717 718 719 720
        if (writeable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
721 722
    }

A
Avi Kivity 已提交
723
    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
A
Avi Kivity 已提交
724

725
    while (1) {
726
        mem = kvm_lookup_overlapping_slot(kml, start_addr, start_addr + size);
727 728 729 730
        if (!mem) {
            break;
        }

A
Avi Kivity 已提交
731
        if (add && start_addr >= mem->start_addr &&
732
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
733
            (ram - start_addr == mem->ram - mem->start_addr)) {
734
            /* The new slot fits into the existing one and comes with
735
             * identical parameters - update flags and done. */
736
            kvm_slot_update_flags(kml, mem, mr);
737 738 739 740 741
            return;
        }

        old = *mem;

742
        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
743
            kvm_physical_sync_dirty_bitmap(kml, section);
744 745
        }

746 747
        /* unregister the overlapping slot */
        mem->memory_size = 0;
748
        err = kvm_set_user_memory_region(kml, mem);
749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
A
Avi Kivity 已提交
764
            old.start_addr == start_addr && old.memory_size < size && add) {
765
            mem = kvm_alloc_slot(kml);
766 767
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
768
            mem->ram = old.ram;
769
            mem->flags = kvm_mem_flags(mr);
770

771
            err = kvm_set_user_memory_region(kml, mem);
772 773 774 775 776 777 778
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
779
            ram += old.memory_size;
780 781 782 783 784 785
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
786
            mem = kvm_alloc_slot(kml);
787 788
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
789
            mem->ram = old.ram;
790
            mem->flags =  kvm_mem_flags(mr);
791

792
            err = kvm_set_user_memory_region(kml, mem);
793 794 795
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
796 797 798 799 800
#ifdef TARGET_PPC
                fprintf(stderr, "%s: This is probably because your kernel's " \
                                "PAGE_SIZE is too big. Please try to use 4k " \
                                "PAGE_SIZE!\n", __func__);
#endif
801 802 803 804 805 806 807 808
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

809
            mem = kvm_alloc_slot(kml);
810 811 812
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
813
            mem->ram = old.ram + size_delta;
814
            mem->flags = kvm_mem_flags(mr);
815

816
            err = kvm_set_user_memory_region(kml, mem);
817 818 819 820 821 822 823 824 825
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
J
Jan Kiszka 已提交
826
    if (!size) {
827
        return;
J
Jan Kiszka 已提交
828
    }
A
Avi Kivity 已提交
829
    if (!add) {
830
        return;
J
Jan Kiszka 已提交
831
    }
832
    mem = kvm_alloc_slot(kml);
833 834
    mem->memory_size = size;
    mem->start_addr = start_addr;
835
    mem->ram = ram;
836
    mem->flags = kvm_mem_flags(mr);
837

838
    err = kvm_set_user_memory_region(kml, mem);
839 840 841 842 843 844 845
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

A
Avi Kivity 已提交
846 847 848
static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
849 850
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

P
Paolo Bonzini 已提交
851
    memory_region_ref(section->mr);
852
    kvm_set_phys_mem(kml, section, true);
A
Avi Kivity 已提交
853 854 855 856 857
}

static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
858 859 860
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_set_phys_mem(kml, section, false);
P
Paolo Bonzini 已提交
861
    memory_region_unref(section->mr);
A
Avi Kivity 已提交
862 863 864 865
}

static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
866
{
867
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
A
Avi Kivity 已提交
868 869
    int r;

870
    r = kvm_physical_sync_dirty_bitmap(kml, section);
A
Avi Kivity 已提交
871 872 873
    if (r < 0) {
        abort();
    }
874 875
}

876 877 878 879 880 881
static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
882 883
    int r;

884
    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
885 886
                               data, true, int128_get64(section->size),
                               match_data);
887
    if (r < 0) {
888 889
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
890 891 892 893
        abort();
    }
}

894 895 896 897
static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
898
{
899
    int fd = event_notifier_get_fd(e);
900 901
    int r;

902
    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
903 904
                               data, false, int128_get64(section->size),
                               match_data);
905 906 907 908 909
    if (r < 0) {
        abort();
    }
}

910 911 912 913
static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
914
{
915
    int fd = event_notifier_get_fd(e);
916 917
    int r;

918
    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
919 920
                              data, true, int128_get64(section->size),
                              match_data);
921
    if (r < 0) {
922 923
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
924 925 926 927
        abort();
    }
}

928 929 930 931
static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
932 933

{
934
    int fd = event_notifier_get_fd(e);
935 936
    int r;

937
    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
938 939
                              data, false, int128_get64(section->size),
                              match_data);
940 941 942 943 944
    if (r < 0) {
        abort();
    }
}

945 946
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id)
947 948 949 950
{
    int i;

    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
951
    kml->as_id = as_id;
952 953 954 955 956 957 958 959 960 961 962 963 964 965

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.log_sync = kvm_log_sync;
    kml->listener.priority = 10;

    memory_listener_register(&kml->listener, as);
}
966 967 968 969

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
970
    .priority = 10,
971 972
};

973
static void kvm_handle_interrupt(CPUState *cpu, int mask)
974
{
975
    cpu->interrupt_request |= mask;
976

977
    if (!qemu_cpu_is_self(cpu)) {
978
        qemu_cpu_kick(cpu);
979 980 981
    }
}

982
int kvm_set_irq(KVMState *s, int irq, int level)
983 984 985 986
{
    struct kvm_irq_level event;
    int ret;

987
    assert(kvm_async_interrupts_enabled());
988 989 990

    event.level = level;
    event.irq = irq;
991
    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
992
    if (ret < 0) {
993
        perror("kvm_set_irq");
994 995 996
        abort();
    }

997
    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
998 999 1000
}

#ifdef KVM_CAP_IRQ_ROUTING
1001 1002 1003 1004 1005
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;

1006 1007
static void set_gsi(KVMState *s, unsigned int gsi)
{
1008
    set_bit(gsi, s->used_gsi_bitmap);
1009 1010
}

1011 1012
static void clear_gsi(KVMState *s, unsigned int gsi)
{
1013
    clear_bit(gsi, s->used_gsi_bitmap);
1014 1015
}

1016
void kvm_init_irq_routing(KVMState *s)
1017
{
1018
    int gsi_count, i;
1019

A
Alexander Graf 已提交
1020
    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1021 1022
    if (gsi_count > 0) {
        /* Round up so we can search ints using ffs */
1023
        s->used_gsi_bitmap = bitmap_new(gsi_count);
1024
        s->gsi_count = gsi_count;
1025 1026 1027 1028 1029
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

1030
    if (!kvm_direct_msi_allowed) {
1031 1032 1033
        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
            QTAILQ_INIT(&s->msi_hashtab[i]);
        }
1034 1035
    }

1036 1037 1038
    kvm_arch_init_irq_routing(s);
}

1039
void kvm_irqchip_commit_routes(KVMState *s)
1040 1041 1042 1043 1044 1045 1046 1047
{
    int ret;

    s->irq_routes->flags = 0;
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}

1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        s->irq_routes = g_realloc(s->irq_routes, size);
        s->nr_allocated_irq_routes = n;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];
1066 1067

    *new = *entry;
1068 1069 1070 1071

    set_gsi(s, entry->gsi);
}

1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083
static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *entry;
    int n;

    for (n = 0; n < s->irq_routes->nr; n++) {
        entry = &s->irq_routes->entries[n];
        if (entry->gsi != new_entry->gsi) {
            continue;
        }

1084 1085 1086 1087
        if(!memcmp(entry, new_entry, sizeof *entry)) {
            return 0;
        }

1088
        *entry = *new_entry;
1089 1090 1091 1092 1093 1094 1095 1096 1097

        kvm_irqchip_commit_routes(s);

        return 0;
    }

    return -ESRCH;
}

1098
void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1099
{
1100
    struct kvm_irq_routing_entry e = {};
1101

1102 1103
    assert(pin < s->gsi_count);

1104 1105 1106 1107 1108 1109 1110 1111
    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

1112
void kvm_irqchip_release_virq(KVMState *s, int virq)
1113 1114 1115 1116
{
    struct kvm_irq_routing_entry *e;
    int i;

1117 1118 1119 1120
    if (kvm_gsi_direct_mapping()) {
        return;
    }

1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        }
    }
    clear_gsi(s, virq);
}

static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API. */
    return data & 0xff;
}

static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    KVMMSIRoute *route, *next;
    unsigned int hash;

    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
            kvm_irqchip_release_virq(s, route->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
            g_free(route);
        }
    }
}

static int kvm_irqchip_get_virq(KVMState *s)
{
1154
    int next_virq;
1155

1156 1157 1158 1159 1160 1161
    /*
     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
     * GSI numbers are more than the number of IRQ route. Allocating a GSI
     * number can succeed even though a new route entry cannot be added.
     * When this happens, flush dynamic MSI entries to free IRQ route entries.
     */
1162
    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1163 1164 1165
        kvm_flush_dynamic_msi_routes(s);
    }

1166
    /* Return the lowest unused GSI in the bitmap */
1167 1168 1169 1170 1171
    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
    if (next_virq >= s->gsi_count) {
        return -ENOSPC;
    } else {
        return next_virq;
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182
    }
}

static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
{
    unsigned int hash = kvm_hash_msi(msg.data);
    KVMMSIRoute *route;

    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1183
            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1184 1185 1186 1187 1188 1189 1190 1191
            return route;
        }
    }
    return NULL;
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
1192
    struct kvm_msi msi;
1193 1194
    KVMMSIRoute *route;

1195
    if (kvm_direct_msi_allowed) {
1196 1197
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
1198
        msi.data = le32_to_cpu(msg.data);
1199 1200 1201 1202 1203 1204
        msi.flags = 0;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

1205 1206
    route = kvm_lookup_msi_route(s, msg);
    if (!route) {
1207
        int virq;
1208 1209 1210 1211 1212 1213

        virq = kvm_irqchip_get_virq(s);
        if (virq < 0) {
            return virq;
        }

1214
        route = g_malloc0(sizeof(KVMMSIRoute));
1215 1216 1217 1218 1219
        route->kroute.gsi = virq;
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.flags = 0;
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;
1220
        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1221 1222

        kvm_add_routing_entry(s, &route->kroute);
1223
        kvm_irqchip_commit_routes(s);
1224 1225 1226 1227 1228 1229 1230

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

1231
    return kvm_set_irq(s, route->kroute.gsi, 1);
1232 1233
}

1234
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg, PCIDevice *dev)
1235
{
1236
    struct kvm_irq_routing_entry kroute = {};
1237 1238
    int virq;

1239
    if (kvm_gsi_direct_mapping()) {
1240
        return kvm_arch_msi_data_to_gsi(msg.data);
1241 1242
    }

1243
    if (!kvm_gsi_routing_enabled()) {
1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
1257
    kroute.u.msi.data = le32_to_cpu(msg.data);
1258
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1259 1260 1261
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }
1262 1263

    kvm_add_routing_entry(s, &kroute);
1264
    kvm_irqchip_commit_routes(s);
1265 1266 1267 1268

    return virq;
}

1269 1270
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
1271
{
1272
    struct kvm_irq_routing_entry kroute = {};
1273

1274 1275 1276 1277
    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

1278 1279 1280 1281 1282 1283 1284 1285 1286
    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
1287
    kroute.u.msi.data = le32_to_cpu(msg.data);
1288
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1289 1290
        return -EINVAL;
    }
1291 1292 1293 1294

    return kvm_update_routing_entry(s, &kroute);
}

1295 1296
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
1297 1298 1299 1300 1301 1302 1303
{
    struct kvm_irqfd irqfd = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

1304 1305 1306 1307 1308
    if (rfd != -1) {
        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = rfd;
    }

1309
    if (!kvm_irqfds_enabled()) {
1310 1311 1312 1313 1314 1315
        return -ENOSYS;
    }

    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
}

1316 1317
int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
1318
    struct kvm_irq_routing_entry kroute = {};
1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
    kroute.flags = 0;
    kroute.u.adapter.summary_addr = adapter->summary_addr;
    kroute.u.adapter.ind_addr = adapter->ind_addr;
    kroute.u.adapter.summary_offset = adapter->summary_offset;
    kroute.u.adapter.ind_offset = adapter->ind_offset;
    kroute.u.adapter.adapter_id = adapter->adapter_id;

    kvm_add_routing_entry(s, &kroute);

    return virq;
}

1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371
int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }
    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
        return -ENOSYS;
    }
    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
    kroute.flags = 0;
    kroute.u.hv_sint.vcpu = vcpu;
    kroute.u.hv_sint.sint = sint;

    kvm_add_routing_entry(s, &kroute);
    kvm_irqchip_commit_routes(s);

    return virq;
}

1372 1373
#else /* !KVM_CAP_IRQ_ROUTING */

1374
void kvm_init_irq_routing(KVMState *s)
1375 1376
{
}
1377

1378 1379 1380 1381
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}

1382 1383 1384 1385
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}
1386 1387 1388

int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
1389
    return -ENOSYS;
1390
}
1391

1392 1393 1394 1395 1396
int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    return -ENOSYS;
}

1397 1398 1399 1400 1401
int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    return -ENOSYS;
}

1402 1403 1404 1405
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    abort();
}
1406 1407 1408 1409 1410

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
1411 1412
#endif /* !KVM_CAP_IRQ_ROUTING */

1413 1414
int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                       EventNotifier *rn, int virq)
1415
{
1416 1417
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
           rn ? event_notifier_get_fd(rn) : -1, virq, true);
1418 1419
}

1420 1421
int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
1422
{
1423 1424
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
           false);
1425 1426
}

1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455
int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
}

int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}

void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}

1456
static void kvm_irqchip_create(MachineState *machine, KVMState *s)
1457 1458 1459
{
    int ret;

1460 1461 1462 1463 1464 1465 1466 1467 1468 1469
    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
1470 1471
    }

1472 1473
    /* First probe and see if there's a arch-specific hook to create the
     * in-kernel irqchip for us */
1474
    ret = kvm_arch_irqchip_create(machine, s);
1475
    if (ret == 0) {
1476 1477 1478 1479 1480 1481
        if (machine_kernel_irqchip_split(machine)) {
            perror("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
1482 1483 1484 1485
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
1486 1487
    }

1488
    kvm_kernel_irqchip = true;
1489 1490 1491 1492
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
1493
    kvm_halt_in_kernel_allowed = true;
1494 1495 1496

    kvm_init_irq_routing(s);

1497
    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
1498 1499
}

1500 1501 1502 1503 1504
/* Find number of supported CPUs using the recommended
 * procedure from the kernel API documentation to cope with
 * older kernels that may be missing capabilities.
 */
static int kvm_recommended_vcpus(KVMState *s)
1505
{
1506 1507 1508
    int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
    return (ret) ? ret : 4;
}
1509

1510 1511 1512 1513
static int kvm_max_vcpus(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
    return (ret) ? ret : kvm_recommended_vcpus(s);
1514 1515
}

G
Greg Kurz 已提交
1516 1517 1518 1519 1520 1521
bool kvm_vcpu_id_is_valid(int vcpu_id)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);
    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpus(s);
}

1522
static int kvm_init(MachineState *ms)
A
aliguori 已提交
1523
{
1524
    MachineClass *mc = MACHINE_GET_CLASS(ms);
1525 1526 1527
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
1528 1529 1530 1531 1532 1533 1534 1535 1536
    struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP",          smp_cpus },
        { "hotpluggable", max_cpus },
        { NULL, }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
A
aliguori 已提交
1537
    KVMState *s;
1538
    const KVMCapabilityInfo *missing_cap;
A
aliguori 已提交
1539
    int ret;
1540
    int type = 0;
1541
    const char *kvm_type;
A
aliguori 已提交
1542

1543
    s = KVM_STATE(ms->accelerator);
A
aliguori 已提交
1544

1545 1546 1547 1548 1549 1550 1551 1552
    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= getpagesize());

1553 1554
    s->sigmask_len = 8;

1555
#ifdef KVM_CAP_SET_GUEST_DEBUG
B
Blue Swirl 已提交
1556
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1557
#endif
G
Gu Zheng 已提交
1558
    QLIST_INIT(&s->kvm_parked_vcpus);
A
aliguori 已提交
1559
    s->vmfd = -1;
K
Kevin Wolf 已提交
1560
    s->fd = qemu_open("/dev/kvm", O_RDWR);
A
aliguori 已提交
1561 1562 1563 1564 1565 1566 1567 1568
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
1569
        if (ret >= 0) {
A
aliguori 已提交
1570
            ret = -EINVAL;
J
Jan Kiszka 已提交
1571
        }
A
aliguori 已提交
1572 1573 1574 1575 1576 1577 1578 1579 1580 1581
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

1582 1583 1584 1585 1586 1587 1588
    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots) {
        s->nr_slots = 32;
    }

1589 1590 1591
    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);
1592

1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603
    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            fprintf(stderr,
                    "Warning: Number of %s cpus requested (%d) exceeds "
                    "the recommended cpus supported by KVM (%d)\n",
                    nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
                        "the maximum cpus supported by KVM (%d)\n",
                        nc->name, nc->num, hard_vcpus_limit);
1604
                exit(1);
1605 1606 1607
            }
        }
        nc++;
1608 1609
    }

1610
    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1611 1612
    if (mc->kvm_type) {
        type = mc->kvm_type(kvm_type);
1613
    } else if (kvm_type) {
1614
        ret = -EINVAL;
1615 1616 1617 1618
        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
        goto err;
    }

T
thomas knych 已提交
1619
    do {
1620
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
T
thomas knych 已提交
1621 1622 1623
    } while (ret == -EINTR);

    if (ret < 0) {
1624
        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
T
thomas knych 已提交
1625 1626
                strerror(-ret));

1627
#ifdef TARGET_S390X
1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638
        if (ret == -EINVAL) {
            fprintf(stderr,
                    "Host kernel setup problem detected. Please verify:\n");
            fprintf(stderr, "- for kernels supporting the switch_amode or"
                    " user_mode parameters, whether\n");
            fprintf(stderr,
                    "  user space is running in primary address space\n");
            fprintf(stderr,
                    "- for kernels supporting the vm.allocate_pgste sysctl, "
                    "whether it is enabled\n");
        }
1639
#endif
A
aliguori 已提交
1640
        goto err;
1641
    }
A
aliguori 已提交
1642

T
thomas knych 已提交
1643
    s->vmfd = ret;
1644 1645 1646 1647
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
A
aliguori 已提交
1648
    }
1649
    if (missing_cap) {
1650
        ret = -EINVAL;
1651 1652
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
1653 1654 1655
        goto err;
    }

1656
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
A
aliguori 已提交
1657

1658
    s->broken_set_mem_region = 1;
1659
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1660 1661 1662 1663
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }

1664 1665 1666 1667
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

1668 1669 1670
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

1671 1672 1673 1674
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

1675
#ifdef KVM_CAP_IRQ_ROUTING
1676
    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1677
#endif
1678

1679 1680
    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

1681
    s->irq_set_ioctl = KVM_IRQ_LINE;
1682
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1683
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1684 1685
    }

1686 1687 1688 1689 1690
#ifdef KVM_CAP_READONLY_MEM
    kvm_readonly_mem_allowed =
        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
#endif

1691 1692 1693
    kvm_eventfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);

1694 1695 1696 1697 1698 1699
    kvm_irqfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

1700 1701 1702
    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

1703 1704 1705
    kvm_ioeventfd_any_length_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);

1706
    ret = kvm_arch_init(ms, s);
J
Jan Kiszka 已提交
1707
    if (ret < 0) {
A
aliguori 已提交
1708
        goto err;
J
Jan Kiszka 已提交
1709
    }
A
aliguori 已提交
1710

1711 1712
    if (machine_kernel_irqchip_allowed(ms)) {
        kvm_irqchip_create(ms, s);
1713 1714
    }

A
aliguori 已提交
1715
    kvm_state = s;
1716

    if (kvm_eventfds_allowed) {
        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    }
    s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0);
    memory_listener_register(&kvm_io_listener,
                             &address_space_io);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    cpu_interrupt_handler = kvm_handle_interrupt;

    return 0;

err:
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->memory_listener.slots);

    return ret;
}

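/*
 * Record the value that kvm_set_signal_mask() will place in
 * struct kvm_signal_mask::len.
 */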
void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}

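/*
 * Complete a KVM_EXIT_IO exit: replay each of the 'count' accesses of
 * 'size' bytes against the I/O address space.  Runs outside the BQL.
 */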
static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

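/*
 * Report a KVM_EXIT_INTERNAL_ERROR, dumping whatever extra data the
 * kernel supplied.  Returns EXCP_INTERRUPT when the architecture code
 * deems an emulation failure ignorable, -1 (fatal) otherwise.
 */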
static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}

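/*
 * Drain the ring of MMIO writes that the kernel has coalesced and replay
 * them as ordinary memory writes; the in-progress flag turns re-entrant
 * calls into a no-op.
 */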
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}

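/*
 * Register-state synchronization with KVM.  The do_* helpers execute on
 * the vCPU thread via run_on_cpu(); kvm_vcpu_dirty tracks whether QEMU's
 * copy of the registers is newer than the kernel's.
 */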
static void do_kvm_cpu_synchronize_state(void *arg)
{
    CPUState *cpu = arg;

    if (!cpu->kvm_vcpu_dirty) {
        kvm_arch_get_registers(cpu);
        cpu->kvm_vcpu_dirty = true;
    }
}

void kvm_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->kvm_vcpu_dirty) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
    }
}

static void do_kvm_cpu_synchronize_post_reset(void *arg)
{
    CPUState *cpu = arg;

    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->kvm_vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu);
}

static void do_kvm_cpu_synchronize_post_init(void *arg)
{
    CPUState *cpu = arg;

    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
    cpu->kvm_vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu);
}

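/*
 * The per-vCPU run loop: re-enter KVM_RUN until an exit requires
 * userspace attention.  Returns EXCP_HLT/EXCP_INTERRUPT for the caller
 * to service, or a negative value after an unrecoverable error (the VM
 * has already been stopped in that case).
 */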
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        cpu->exit_request = 0;
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();

    do {
        MemTxAttrs attrs;

        if (cpu->kvm_vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->kvm_vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (cpu->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request();
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request();
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked();
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    cpu->exit_request = 0;
    return ret;
}

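/*
 * Thin wrappers around the ioctls on the /dev/kvm, VM, vCPU and device
 * fds: trace each call and fold the -1/errno convention into a negative
 * errno return value.
 */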
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    ret = ioctl(cpu->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    ret = ioctl(fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

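/*
 * Probe KVM_HAS_DEVICE_ATTR on the VM fd: returns 1 if the group/attr
 * pair is supported, 0 if not or if KVM_CAP_VM_ATTRIBUTES is absent.
 */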
int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}

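/*
 * Read or write one device attribute.  Failure is fatal (abort()), so
 * callers are expected to probe with kvm_device_check_attr() first.
 */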
void kvm_device_access(int fd, int group, uint64_t attr,
                       void *val, bool write)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_report("KVM_%s_DEVICE_ATTR failed: %s",
                     write ? "SET" : "GET", strerror(-err));
        error_printf("Group %d attr 0x%016" PRIx64, group, attr);
        abort();
    }
}

int kvm_has_sync_mmu(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
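/* Look up a software breakpoint by guest PC. */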
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

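/*
 * KVM_SET_GUEST_DEBUG must be issued on the vCPU's own thread, so the
 * arguments are bundled into this struct and handed to run_on_cpu().
 */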
struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *cpu;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;

    dbg_data->err = kvm_vcpu_ioctl(dbg_data->cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);
    data.cpu = cpu;

    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

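/*
 * Insert a software or hardware breakpoint (software ones are reference
 * counted) and push the updated debug state to every vCPU.
 */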
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

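/*
 * Remove a breakpoint, dropping one reference for software breakpoints,
 * then refresh the debug state on every vCPU.
 */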
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

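/* Remove every software and hardware breakpoint and update all vCPUs. */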
void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

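/*
 * Set the signal mask that the kernel installs atomically for the
 * duration of KVM_RUN; a NULL sigset clears any previous mask.
 */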
int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
    return kvm_arch_on_sigbus_vcpu(cpu, code, addr);
}

int kvm_on_sigbus(int code, void *addr)
{
    return kvm_arch_on_sigbus(code, addr);
}

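/*
 * Create an in-kernel device, or merely test for its availability when
 * 'test' is true.  Returns the new device fd, 0 for a successful test,
 * or a negative errno value.
 */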
int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

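/*
 * Like a kvm_create_device() test run, but usable with only a raw VM fd,
 * before a KVMState has been set up.
 */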
bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

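/*
 * Access a single register through the ONE_REG API; failures are traced
 * and the negative errno value is returned to the caller.
 */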
int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);