kvm-all.c 51.4 KB
Newer Older
A
aliguori 已提交
1 2 3 4
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
A
aliguori 已提交
6 7 8
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
A
aliguori 已提交
10 11 12 13 14 15 16 17 18
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
19
#include <stdarg.h>
A
aliguori 已提交
20 21 22 23

#include <linux/kvm.h>

#include "qemu-common.h"
24 25 26
#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
27
#include "sysemu/sysemu.h"
J
Jan Kiszka 已提交
28
#include "hw/hw.h"
29
#include "hw/pci/msi.h"
30
#include "exec/gdbstub.h"
31
#include "sysemu/kvm.h"
32
#include "qemu/bswap.h"
33 34
#include "exec/memory.h"
#include "exec/address-spaces.h"
35
#include "qemu/event_notifier.h"
36
#include "trace.h"
A
aliguori 已提交
37

38 39 40 41 42
/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

43 44 45 46
#ifdef CONFIG_VALGRIND_H
#include <valgrind/memcheck.h>
#endif

47
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
A
aliguori 已提交
48 49
#define PAGE_SIZE TARGET_PAGE_SIZE

A
aliguori 已提交
50 51 52
//#define DEBUG_KVM

/*
 * Debug trace helper.  With DEBUG_KVM defined the arguments are formatted
 * to stderr; otherwise the macro expands to an empty statement.
 */
#ifndef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#endif

60 61
#define KVM_MSI_HASHTAB_SIZE    256

A
aliguori 已提交
62 63
typedef struct KVMSlot
{
A
Avi Kivity 已提交
64
    hwaddr start_addr;
A
Anthony Liguori 已提交
65
    ram_addr_t memory_size;
66
    void *ram;
A
aliguori 已提交
67 68 69
    int slot;
    int flags;
} KVMSlot;
A
aliguori 已提交
70

71 72
typedef struct kvm_dirty_log KVMDirtyLog;

A
aliguori 已提交
73 74 75 76 77
/* Per-VM state shared by the helpers in this file. */
struct KVMState {
    /* Memory slot table; an entry with memory_size == 0 is unused. */
    KVMSlot slots[32];
    int fd;
    int vmfd;
    /* When non-zero, used as the page offset of the coalesced MMIO ring
     * inside a vcpu's kvm_run mapping. */
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int broken_set_mem_region;
    /* When set, KVM_MEM_LOG_DIRTY_PAGES is forced on for every slot. */
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int pit_state2;
    int xsave, xcrs;
    int many_ioeventfds;
    int intx_set_mask;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    /* One bit per GSI, 32 GSIs per word; a set bit means "in use". */
    uint32_t *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
    bool direct_msi;
#endif
};

107
/* The KVMState instance used by the helpers in this file. */
KVMState *kvm_state;

/* Global capability/mode switches.
 * NOTE(review): none of these are written in the visible part of the file;
 * confirm where they are set before relying on them. */
bool kvm_kernel_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_irqfds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_allowed;
A
aliguori 已提交
114

115 116 117 118 119 120
/* Table of capabilities, terminated by KVM_CAP_LAST_INFO.
 * NOTE(review): presumably walked with kvm_check_extension_list() during
 * initialisation; the call site is outside the visible part of the file. */
static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};

A
aliguori 已提交
121 122 123 124 125
/*
 * Return the first unused entry of s->slots (memory_size == 0).
 *
 * Never returns NULL: when every entry is in use a message is written to
 * stderr and the process aborts.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    KVMSlot *slot = s->slots;
    KVMSlot *const end = s->slots + ARRAY_SIZE(s->slots);

    while (slot < end) {
        if (!slot->memory_size) {
            return slot;
        }
        slot++;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

/*
 * Find the slot that covers exactly [start_addr, end_addr).
 *
 * Returns the matching slot, or NULL when no entry has both the same start
 * address and the same end address.
 */
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         hwaddr start_addr,
                                         hwaddr end_addr)
{
    int idx;

    for (idx = 0; idx < ARRAY_SIZE(s->slots); idx++) {
        KVMSlot *slot = &s->slots[idx];

        if (slot->start_addr != start_addr) {
            continue;
        }
        if (end_addr != slot->start_addr + slot->memory_size) {
            continue;
        }
        return slot;
    }

    return NULL;
}

153 154 155 156
/*
 * Find the in-use slot with the lowest start address that overlaps
 * [start_addr, end_addr).  Returns NULL when nothing overlaps.
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            hwaddr start_addr,
                                            hwaddr end_addr)
{
    KVMSlot *lowest = NULL;
    int idx;

    for (idx = 0; idx < ARRAY_SIZE(s->slots); idx++) {
        KVMSlot *slot = &s->slots[idx];

        /* Unused entries never overlap anything. */
        if (slot->memory_size == 0) {
            continue;
        }
        /* Already holding a candidate that starts lower than this one. */
        if (lowest && lowest->start_addr < slot->start_addr) {
            continue;
        }

        if (end_addr > slot->start_addr &&
            start_addr < slot->start_addr + slot->memory_size) {
            lowest = slot;
        }
    }

    return lowest;
}

180
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
A
Avi Kivity 已提交
181
                                       hwaddr *phys_addr)
182 183 184 185 186 187
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

188 189
        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
190 191 192 193 194 195 196
            return 1;
        }
    }

    return 0;
}

197 198 199 200 201 202 203
/*
 * Push the description of @slot to the kernel with
 * KVM_SET_USER_MEMORY_REGION.
 *
 * KVM_MEM_LOG_DIRTY_PAGES is added to the slot's own flags whenever
 * s->migration_log is set.  Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    region.slot = slot->slot;
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr = (unsigned long)slot->ram;

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}

J
Jan Kiszka 已提交
212 213
static void kvm_reset_vcpu(void *opaque)
{
A
Andreas Färber 已提交
214
    CPUState *cpu = opaque;
J
Jan Kiszka 已提交
215

A
Andreas Färber 已提交
216
    kvm_arch_reset_vcpu(cpu);
J
Jan Kiszka 已提交
217
}
218

219
/*
 * Create the kernel vcpu for @cpu, map its kvm_run area and run the
 * architecture-specific initialisation.
 *
 * On success the reset handler kvm_reset_vcpu() is registered for @cpu and
 * the vcpu is reset once.  Returns 0 on success and a negative value on
 * failure.
 *
 * NOTE(review): on a failure after KVM_CREATE_VCPU succeeded, the vcpu fd
 * (and the kvm_run mapping, if already made) are left in place here.
 */
int kvm_init_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return ret;
    }

    /* The ioctl result is kept as the vcpu's file descriptor. */
    cpu->kvm_fd = ret;
    cpu->kvm_state = s;
    cpu->kvm_vcpu_dirty = true;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return ret;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        /* Capture errno before anything else can overwrite it. */
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        return ret;
    }

    /* The ring pointer is set once, from the first vcpu that gets here. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret != 0) {
        return ret;
    }

    qemu_register_reset(kvm_reset_vcpu, cpu);
    kvm_arch_reset_vcpu(cpu);
    return ret;
}

266 267 268
/*
 * dirty pages logging control
 */
269 270 271 272 273 274 275

/* Slot flags for a region: KVM_MEM_LOG_DIRTY_PAGES when @log_dirty is set,
 * otherwise 0.  @s is not used. */
static int kvm_mem_flags(KVMState *s, bool log_dirty)
{
    if (log_dirty) {
        return KVM_MEM_LOG_DIRTY_PAGES;
    }
    return 0;
}

/*
 * Switch dirty-page logging on or off for one slot.
 *
 * The slot's stored flags are always updated.  The kernel is only told when
 * the resulting flags (with the global migration_log override applied)
 * differ from the flags the slot had stored before the call.
 *
 * Returns 0 when no ioctl was needed, otherwise the result of
 * kvm_set_user_memory_region().
 *
 * NOTE(review): the comparison is against the previously *stored* flags,
 * which do not include the migration_log override, so an ioctl can be
 * issued even though the effective flags are unchanged.
 */
static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
{
    KVMState *s = kvm_state;
    const int dirty_bit = KVM_MEM_LOG_DIRTY_PAGES;
    const int prev_flags = mem->flags;
    int effective;

    effective = (prev_flags & ~dirty_bit) | kvm_mem_flags(s, log_dirty);
    mem->flags = effective;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        effective |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    return effective == prev_flags ? 0 : kvm_set_user_memory_region(s, mem);
}

A
Avi Kivity 已提交
298
static int kvm_dirty_pages_log_change(hwaddr phys_addr,
299 300 301 302 303 304 305 306
                                      ram_addr_t size, bool log_dirty)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);

    if (mem == NULL)  {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
A
Avi Kivity 已提交
307
                (hwaddr)(phys_addr + size - 1));
308 309 310 311 312
        return -EINVAL;
    }
    return kvm_slot_dirty_pages_log_change(mem, log_dirty);
}

A
Avi Kivity 已提交
313 314
/* Listener hook: enable dirty logging for @section; aborts on failure. */
static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    if (kvm_dirty_pages_log_change(section->offset_within_address_space,
                                   section->size, true) < 0) {
        abort();
    }
}

A
Avi Kivity 已提交
325 326
/* Listener hook: disable dirty logging for @section; aborts on failure. */
static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    if (kvm_dirty_pages_log_change(section->offset_within_address_space,
                                   section->size, false) < 0) {
        abort();
    }
}

337
static int kvm_set_migration_log(int enable)
338 339 340 341 342 343 344 345 346 347
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

348 349 350
        if (!mem->memory_size) {
            continue;
        }
351 352 353 354 355 356 357 358 359 360 361
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

362
/* get kvm's dirty pages bitmap and update qemu's */
363 364
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
A
Alexander Graf 已提交
365
{
366
    unsigned int i, j;
367
    unsigned long page_number, c;
A
Avi Kivity 已提交
368
    hwaddr addr, addr1;
369
    unsigned int len = ((section->size / getpagesize()) + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
370
    unsigned long hpratio = getpagesize() / TARGET_PAGE_SIZE;
371 372 373 374 375 376 377 378 379 380 381

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
382
                page_number = (i * HOST_LONG_BITS + j) * hpratio;
383
                addr1 = page_number * TARGET_PAGE_SIZE;
384
                addr = section->offset_within_region + addr1;
385 386
                memory_region_set_dirty(section->mr, addr,
                                        TARGET_PAGE_SIZE * hpratio);
387 388 389 390
            } while (c != 0);
        }
    }
    return 0;
A
Alexander Graf 已提交
391 392
}

393 394
/* Round x up to a multiple of y; y must be a power of two.  Both arguments
 * are evaluated more than once. */
#define ALIGN(x, y) \
    (((x) + (y) - 1) & ~((y) - 1))

395 396
/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
397 398 399
 * This function updates qemu's dirty bitmap using
 * memory_region_set_dirty().  This means all bits are set
 * to dirty.
400
 *
401
 * @start_add: start of logged region.
402 403
 * @end_addr: end of logged region.
 */
404
static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
405 406
{
    KVMState *s = kvm_state;
407 408 409 410
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;
A
Avi Kivity 已提交
411 412
    hwaddr start_addr = section->offset_within_address_space;
    hwaddr end_addr = start_addr + section->size;
413

414 415 416 417 418 419
    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }
420

421 422 423 424 425 426 427 428 429 430 431 432 433 434
        /* XXX bad kernel interface alert
         * For dirty bitmap, kernel allocates array of size aligned to
         * bits-per-long.  But for case when the kernel is 64bits and
         * the userspace is 32bits, userspace can't align to the same
         * bits-per-long, since sizeof(long) is different between kernel
         * and user space.  This way, userspace will provide buffer which
         * may be 4 bytes less than the kernel will use, resulting in
         * userspace memory corruption (which is not detectable by valgrind
         * too, in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
         * a hope that sizeof(long) wont become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
435
        if (!d.dirty_bitmap) {
436
            d.dirty_bitmap = g_malloc(size);
437
        } else if (size > allocated_size) {
438
            d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
439 440 441
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);
442

443
        d.slot = mem->slot;
444

445
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
446
            DPRINTF("ioctl failed %d\n", errno);
447 448 449
            ret = -1;
            break;
        }
450

451
        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
452
        start_addr = mem->start_addr + mem->memory_size;
453
    }
454
    g_free(d.dirty_bitmap);
455 456

    return ret;
457 458
}

459 460
/*
 * Listener hook: register [start, start + size) as a coalesced MMIO zone.
 * Does nothing when s->coalesced_mmio is zero; the ioctl result is
 * deliberately ignored.
 */
static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *secion,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pad = 0;

    (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}

476 477
/*
 * Listener hook: unregister [start, start + size) as a coalesced MMIO zone.
 * Does nothing when s->coalesced_mmio is zero; the ioctl result is
 * deliberately ignored.
 */
static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *secion,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pad = 0;

    (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}

493 494 495 496 497 498 499 500 501 502 503 504
/*
 * Query KVM_CHECK_EXTENSION for @extension.
 *
 * Returns the value reported by the ioctl, with any negative result
 * (failure) mapped to 0.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int value = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return value < 0 ? 0 : value;
}

505
static int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val,
506
                                  bool assign, uint32_t size, bool datamatch)
M
Michael S. Tsirkin 已提交
507 508 509 510
{
    int ret;
    struct kvm_ioeventfd iofd;

511
    iofd.datamatch = datamatch ? val : 0;
M
Michael S. Tsirkin 已提交
512 513
    iofd.addr = addr;
    iofd.len = size;
514
    iofd.flags = 0;
M
Michael S. Tsirkin 已提交
515 516 517 518 519 520
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

521 522 523
    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
M
Michael S. Tsirkin 已提交
524 525 526 527 528 529 530 531 532 533 534 535 536
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

537
static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
538
                                 bool assign, uint32_t size, bool datamatch)
M
Michael S. Tsirkin 已提交
539 540
{
    struct kvm_ioeventfd kick = {
541
        .datamatch = datamatch ? val : 0,
M
Michael S. Tsirkin 已提交
542
        .addr = addr,
543
        .flags = KVM_IOEVENTFD_FLAG_PIO,
544
        .len = size,
M
Michael S. Tsirkin 已提交
545 546 547 548 549 550
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
551 552 553
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
M
Michael S. Tsirkin 已提交
554 555 556 557 558 559 560 561 562 563 564
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}


565 566
/*
 * Probe whether the host accepts more than six ioeventfds.
 *
 * Returns 1 when seven PIO ioeventfds could be registered at once, 0
 * otherwise (including builds without CONFIG_EVENTFD).  Everything that
 * was registered during the probe is removed again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#ifdef CONFIG_EVENTFD
    int ioeventfds[7];
    int registered = 0;
    int many;

    while (registered < ARRAY_SIZE(ioeventfds)) {
        int efd = eventfd(0, EFD_CLOEXEC);

        ioeventfds[registered] = efd;
        if (efd < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio(efd, 0, registered, true, 2, true) < 0) {
            close(efd);
            break;
        }
        registered++;
    }

    /* Decide whether many devices are supported or not */
    many = registered == ARRAY_SIZE(ioeventfds);

    /* Undo the registrations in reverse order. */
    for (registered--; registered >= 0; registered--) {
        kvm_set_ioeventfd_pio(ioeventfds[registered], 0, registered,
                              false, 2, true);
        close(ioeventfds[registered]);
    }
    return many;
#else
    return 0;
#endif
}

602 603 604 605 606 607 608 609 610 611 612 613
/*
 * Check every capability in @list (terminated by an entry with a NULL
 * name).  Returns the first entry that kvm_check_extension() reports as
 * missing, or NULL when all of them are present.
 */
static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    const KVMCapabilityInfo *cap;

    for (cap = list; cap->name; cap++) {
        if (kvm_check_extension(s, cap->value) == 0) {
            return cap;
        }
    }
    return NULL;
}

A
Avi Kivity 已提交
614
/*
 * Register (@add == true) or unregister (@add == false) the RAM described
 * by @section with the kernel.
 *
 * The range is first trimmed to whole target pages.  Every existing slot
 * that overlaps the trimmed range is then unregistered; the parts of it
 * that lie before or after the new range are re-registered as separate
 * prefix/suffix slots.  Finally, when adding, a slot for the new range is
 * registered.  Non-RAM regions and ranges that shrink to nothing are
 * ignored.  Any failing kernel call is fatal (abort()).
 */
static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
{
    KVMState *s = kvm_state;
    KVMSlot *mem, old;
    int err;
    MemoryRegion *mr = section->mr;
    bool log_dirty = memory_region_is_logging(mr);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = section->size;
    void *ram = NULL;
    unsigned delta;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address.  Pad the start
       address up to the next page boundary and truncate the size down
       to a whole number of pages.  The padding must be derived from
       start_addr: deriving it from size would misalign an aligned start
       whenever the size is not a page multiple. */
    delta = TARGET_PAGE_SIZE - (start_addr & ~TARGET_PAGE_MASK);
    delta &= ~TARGET_PAGE_MASK;
    if (delta > size) {
        return;
    }
    start_addr += delta;
    size -= delta;
    size &= TARGET_PAGE_MASK;
    if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
        return;
    }

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Host address matching the (possibly advanced) start address. */
    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (add && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (ram - start_addr == mem->ram - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - update flags and done. */
            kvm_slot_dirty_pages_log_change(mem, log_dirty);
            return;
        }

        /* Keep a copy: the table entry is about to be cleared. */
        old = *mem;

        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_physical_sync_dirty_bitmap(section);
        }

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size && add) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->ram = old.ram;
            mem->flags = kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            /* The old fragment now covers the head of the new range. */
            start_addr += old.memory_size;
            ram += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->ram = old.ram;
            mem->flags =  kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
#ifdef TARGET_PPC
                fprintf(stderr, "%s: This is probably because your kernel's " \
                                "PAGE_SIZE is too big. Please try to use 4k " \
                                "PAGE_SIZE!\n", __func__);
#endif
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->ram = old.ram + size_delta;
            mem->flags = kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    if (!add) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->ram = ram;
    mem->flags = kvm_mem_flags(s, log_dirty);

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

A
Avi Kivity 已提交
766 767 768 769 770 771 772 773 774 775 776 777 778 779
/* Listener hook: a memory section appeared; register it with the kernel. */
static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    const bool add = true;

    kvm_set_phys_mem(section, add);
}

/* Listener hook: a memory section went away; unregister it. */
static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    const bool add = false;

    kvm_set_phys_mem(section, add);
}

/* Listener hook: pull the kernel's dirty bitmap for @section; aborts when
 * the synchronisation reports an error. */
static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    if (kvm_physical_sync_dirty_bitmap(section) < 0) {
        abort();
    }
}

A
Avi Kivity 已提交
789
/* Listener hook: turn the global dirty-logging override on. */
static void kvm_log_global_start(struct MemoryListener *listener)
{
    int r = kvm_set_migration_log(1);

    assert(r >= 0);
}

A
Avi Kivity 已提交
797
/* Listener hook: turn the global dirty-logging override off. */
static void kvm_log_global_stop(struct MemoryListener *listener)
{
    int r = kvm_set_migration_log(0);

    assert(r >= 0);
}

805 806 807 808 809 810
/* Listener hook: attach notifier @e as an MMIO ioeventfd covering
 * @section; aborts when the registration fails. */
static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);

    if (kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, true, section->size, match_data) < 0) {
        abort();
    }
}

820 821 822 823
/* Listener hook: detach notifier @e from the MMIO ioeventfd covering
 * @section; aborts when the removal fails. */
static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);

    if (kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, false, section->size, match_data) < 0) {
        abort();
    }
}

835 836 837 838
/* Listener hook: attach notifier @e as a port I/O ioeventfd covering
 * @section; aborts when the registration fails. */
static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);

    if (kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, section->size, match_data) < 0) {
        abort();
    }
}

850 851 852 853
/* Listener hook: detach notifier @e from the port I/O ioeventfd covering
 * @section; aborts when the removal fails. */
static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);

    if (kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, false, section->size, match_data) < 0) {
        abort();
    }
}

A
Avi Kivity 已提交
866 867 868
static MemoryListener kvm_memory_listener = {
    .region_add = kvm_region_add,
    .region_del = kvm_region_del,
869 870
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
A
Avi Kivity 已提交
871 872 873
    .log_sync = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop = kvm_log_global_stop,
874 875
    .eventfd_add = kvm_mem_ioeventfd_add,
    .eventfd_del = kvm_mem_ioeventfd_del,
876 877
    .coalesced_mmio_add = kvm_coalesce_mmio_region,
    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
878 879 880 881 882 883
    .priority = 10,
};

/* Callback table for port I/O: only the ioeventfd hooks are provided. */
static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

887
/* Record @mask in cpu->interrupt_request and, when called from a thread
 * other than the vcpu's own, kick the vcpu. */
static void kvm_handle_interrupt(CPUState *cpu, int mask)
{
    cpu->interrupt_request |= mask;

    if (qemu_cpu_is_self(cpu)) {
        return;
    }
    qemu_cpu_kick(cpu);
}

896
/*
 * Set interrupt line @irq to @level using the ioctl stored in
 * s->irq_set_ioctl.  A failing ioctl is fatal (perror + abort).
 *
 * Returns 1 when the ioctl in use is KVM_IRQ_LINE, otherwise the status
 * value written back into the request.
 */
int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event;

    assert(kvm_async_interrupts_enabled());

    event.level = level;
    event.irq = irq;
    if (kvm_vm_ioctl(s, s->irq_set_ioctl, &event) < 0) {
        perror("kvm_set_irq");
        abort();
    }

    if (s->irq_set_ioctl == KVM_IRQ_LINE) {
        return 1;
    }
    return event.status;
}

#ifdef KVM_CAP_IRQ_ROUTING
915 916 917 918 919
/* Element of the per-bucket lists in KVMState.msi_hashtab. */
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;  /* routing entry carried by the node */
    QTAILQ_ENTRY(KVMMSIRoute) entry;      /* list linkage */
} KVMMSIRoute;

920 921 922 923 924
/* Mark @gsi as in use in s->used_gsi_bitmap (32 GSIs per word). */
static void set_gsi(KVMState *s, unsigned int gsi)
{
    uint32_t *word = &s->used_gsi_bitmap[gsi / 32];

    *word |= 1U << (gsi % 32);
}

925 926 927 928 929
/* Mark @gsi as free in s->used_gsi_bitmap (32 GSIs per word). */
static void clear_gsi(KVMState *s, unsigned int gsi)
{
    uint32_t *word = &s->used_gsi_bitmap[gsi / 32];

    *word &= ~(1U << (gsi % 32));
}

930 931
/*
 * Set up the IRQ routing bookkeeping in @s: the GSI usage bitmap (when the
 * KVM_CAP_IRQ_ROUTING query returns a positive GSI count), an empty routing
 * table, and - unless direct MSI is in use - the MSI hash buckets.  Ends by
 * calling the architecture hook.
 */
static void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count, bucket;

    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
    if (gsi_count > 0) {
        unsigned int gsi_bits, gsi;

        /* Round up so we can search ints using ffs */
        gsi_bits = ALIGN(gsi_count, 32);
        s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
        s->gsi_count = gsi_count;

        /* Mark any over-allocated bits as already in use */
        for (gsi = gsi_count; gsi < gsi_bits; gsi++) {
            set_gsi(s, gsi);
        }
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    if (!s->direct_msi) {
        for (bucket = 0; bucket < KVM_MSI_HASHTAB_SIZE; bucket++) {
            QTAILQ_INIT(&s->msi_hashtab[bucket]);
        }
    }

    kvm_arch_init_irq_routing(s);
}

961 962 963 964 965 966 967 968 969
/* Send the current routing table to the kernel with KVM_SET_GSI_ROUTING;
 * a non-zero result trips the assertion. */
static void kvm_irqchip_commit_routes(KVMState *s)
{
    struct kvm_irq_routing *table = s->irq_routes;
    int ret;

    table->flags = 0;
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, table);
    assert(ret == 0);
}

970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
/*
 * Append a copy of @entry to the routing table of @s, growing the table
 * (doubling, with a minimum of 64 entries) when it is full, mark the GSI
 * as used and commit the table to the kernel.
 */
static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *slot;
    int idx;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        int capacity = s->nr_allocated_irq_routes * 2;
        int bytes;

        if (capacity < 64) {
            capacity = 64;
        }
        /* Header plus the trailing array of entries. */
        bytes = sizeof(struct kvm_irq_routing);
        bytes += capacity * sizeof(*slot);
        s->irq_routes = g_realloc(s->irq_routes, bytes);
        s->nr_allocated_irq_routes = capacity;
    }

    idx = s->irq_routes->nr++;
    slot = &s->irq_routes->entries[idx];
    memset(slot, 0, sizeof(*slot));
    slot->gsi = entry->gsi;
    slot->type = entry->type;
    slot->flags = entry->flags;
    slot->u = entry->u;

    set_gsi(s, entry->gsi);

    kvm_irqchip_commit_routes(s);
}

999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022
/*
 * Overwrite type, flags and payload of the first routing entry whose GSI
 * equals new_entry->gsi and commit the table.
 *
 * Returns 0 on success, -ESRCH if no entry with that GSI exists.
 */
static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    int idx;

    for (idx = 0; idx < s->irq_routes->nr; idx++) {
        struct kvm_irq_routing_entry *cur = &s->irq_routes->entries[idx];

        if (cur->gsi == new_entry->gsi) {
            cur->type = new_entry->type;
            cur->flags = new_entry->flags;
            cur->u = new_entry->u;

            kvm_irqchip_commit_routes(s);

            return 0;
        }
    }

    return -ESRCH;
}

1023
/*
 * Add a route that delivers GSI @irq to pin @pin of in-kernel irqchip
 * @irqchip.  @pin must be below the GSI count reported by the kernel.
 */
void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e;

    assert(pin < s->gsi_count);

    /*
     * Zero the whole entry first: kvm_add_routing_entry() copies the full
     * union, so any bytes not covered by u.irqchip would otherwise reach
     * the kernel uninitialised.
     */
    memset(&e, 0, sizeof(e));
    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

1037
/*
 * Drop every routing entry whose GSI is @virq from the in-memory routing
 * table of @s and mark the GSI as free.  The table is compacted by moving
 * the last entry into the freed slot.  This function does not itself
 * commit the table to the kernel.
 */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    i = 0;
    while (i < s->irq_routes->nr) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
            /*
             * Slot i now holds the former last entry, which has not been
             * examined yet: check it again instead of advancing.
             */
            continue;
        }
        i++;
    }
    clear_gsi(s, virq);
}

/* Hash an MSI data word into an msi_hashtab bucket: its low 8 bits. */
static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API. */
    const uint32_t low_byte_mask = 0xff;

    return data & low_byte_mask;
}

/*
 * Release the virq of every route held in the MSI hash table of @s,
 * unlink it and free it, leaving all buckets empty.
 */
static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    unsigned int bucket;

    for (bucket = 0; bucket < KVM_MSI_HASHTAB_SIZE; bucket++) {
        KVMMSIRoute *cur, *following;

        /* _SAFE variant: the current element is freed inside the loop. */
        QTAILQ_FOREACH_SAFE(cur, &s->msi_hashtab[bucket], entry, following) {
            kvm_irqchip_release_virq(s, cur->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[bucket], cur, entry);
            g_free(cur);
        }
    }
}

/*
 * Find the lowest GSI not marked in the used-GSI bitmap.
 *
 * If none is free and MSIs are routed dynamically (no direct MSI
 * injection), the dynamic MSI routes are flushed once and the search is
 * repeated.
 *
 * Returns the GSI number, or -ENOSPC when no GSI is available.
 */
static int kvm_irqchip_get_virq(KVMState *s)
{
    uint32_t *bitmap = s->used_gsi_bitmap;
    int nwords = ALIGN(s->gsi_count, 32) / 32;
    bool flushed = false;

    for (;;) {
        int w;

        /* Return the lowest unused GSI in the bitmap */
        for (w = 0; w < nwords; w++) {
            int pos = ffs(~bitmap[w]);

            if (pos) {
                return pos - 1 + w * 32;
            }
        }

        if (s->direct_msi || flushed) {
            return -ENOSPC;
        }

        /* Reclaim GSIs held by cached MSI routes, then try once more. */
        flushed = true;
        kvm_flush_dynamic_msi_routes(s);
    }
}

/*
 * Look up the cached route whose MSI address (low and high halves) and
 * data match @msg.  Returns the route, or NULL if none is cached.
 */
static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
{
    KVMMSIRoute *cand;
    unsigned int bucket = kvm_hash_msi(msg.data);

    QTAILQ_FOREACH(cand, &s->msi_hashtab[bucket], entry) {
        if (cand->kroute.u.msi.address_lo != (uint32_t)msg.address) {
            continue;
        }
        if (cand->kroute.u.msi.address_hi != (msg.address >> 32)) {
            continue;
        }
        if (cand->kroute.u.msi.data != msg.data) {
            continue;
        }
        return cand;
    }
    return NULL;
}

/*
 * Inject the MSI described by @msg.
 *
 * With direct MSI support the message is handed to the kernel through
 * KVM_SIGNAL_MSI and the ioctl result is returned.  Otherwise a routing
 * entry for the message is looked up (or created and cached in the MSI
 * hash table) and its GSI is raised via kvm_set_irq().
 *
 * Returns a negative value if no virq could be allocated; otherwise the
 * result of the ioctl / kvm_set_irq().
 */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    struct kvm_msi msi;
    KVMMSIRoute *route;

    if (s->direct_msi) {
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
        msi.data = msg.data;
        msi.flags = 0;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

    route = kvm_lookup_msi_route(s, msg);
    if (!route) {
        int virq;

        virq = kvm_irqchip_get_virq(s);
        if (virq < 0) {
            return virq;
        }

        /*
         * Zero-allocate: kvm_add_routing_entry() copies the whole kroute
         * union, so padding must not contain uninitialised heap bytes.
         */
        route = g_malloc0(sizeof(KVMMSIRoute));
        route->kroute.gsi = virq;
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.flags = 0;
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;
        route->kroute.u.msi.data = msg.data;

        kvm_add_routing_entry(s, &route->kroute);

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

    return kvm_set_irq(s, route->kroute.gsi, 1);
}

1157 1158 1159 1160 1161
/*
 * Allocate a virq and install a static MSI route for @msg.
 *
 * Returns the allocated virq, -ENOSYS if GSI routing is not enabled, or
 * the negative error from virq allocation.
 */
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
    struct kvm_irq_routing_entry kroute;
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    /* Zero everything so no uninitialised union bytes get copied on. */
    memset(&kroute, 0, sizeof(kroute));
    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = msg.data;

    kvm_add_routing_entry(s, &kroute);

    return virq;
}

1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200
/*
 * Re-point the existing route for @virq at the MSI described by @msg.
 *
 * Returns 0 on success, -ENOSYS without an in-kernel irqchip, or -ESRCH
 * (from kvm_update_routing_entry()) if @virq has no routing entry.
 */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    struct kvm_irq_routing_entry kroute;

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    /* Zero everything so no uninitialised union bytes get copied on. */
    memset(&kroute, 0, sizeof(kroute));
    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = msg.data;

    return kvm_update_routing_entry(s, &kroute);
}

1201 1202 1203 1204 1205 1206 1207 1208
/*
 * Bind (@assign true) or unbind (@assign false) event file descriptor
 * @fd to GSI @virq through the KVM_IRQFD ioctl.
 *
 * Returns the ioctl result, or -ENOSYS when irqfds are not enabled.
 */
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    struct kvm_irqfd request = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (kvm_irqfds_enabled()) {
        return kvm_vm_ioctl(s, KVM_IRQFD, &request);
    }

    return -ENOSYS;
}

1216 1217 1218 1219 1220
#else /* !KVM_CAP_IRQ_ROUTING */

/* Built without KVM_CAP_IRQ_ROUTING: there is no routing state to set up. */
static void kvm_init_irq_routing(KVMState *s)
{
}
1221

1222 1223 1224 1225
/* Built without KVM_CAP_IRQ_ROUTING: nothing to release. */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}

1226 1227 1228 1229
/* Built without KVM_CAP_IRQ_ROUTING: calling this aborts the process. */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}
1230 1231 1232

/* Built without KVM_CAP_IRQ_ROUTING: always fails with -ENOSYS. */
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
    return -ENOSYS;
}
1235 1236 1237 1238 1239

/* Built without KVM_CAP_IRQ_ROUTING: calling this aborts the process. */
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    abort();
}
1240 1241 1242 1243 1244

/* Built without KVM_CAP_IRQ_ROUTING: always fails with -ENOSYS. */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
1245 1246
#endif /* !KVM_CAP_IRQ_ROUTING */

J
Jan Kiszka 已提交
1247
/*
 * Attach the file descriptor behind event notifier @n to GSI @virq.
 * Returns the result of kvm_irqchip_assign_irqfd().
 */
int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
{
    int notifier_fd = event_notifier_get_fd(n);

    return kvm_irqchip_assign_irqfd(s, notifier_fd, virq, true);
}

J
Jan Kiszka 已提交
1252
/*
 * Detach the file descriptor behind event notifier @n from GSI @virq.
 * Returns the result of kvm_irqchip_assign_irqfd().
 */
int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
{
    int notifier_fd = event_notifier_get_fd(n);

    return kvm_irqchip_assign_irqfd(s, notifier_fd, virq, false);
}

1257 1258 1259 1260 1261 1262 1263
/*
 * Create the in-kernel irqchip when the "machine" options contain an entry
 * whose "kernel_irqchip" flag is not disabled (default true) and the kernel
 * advertises KVM_CAP_IRQCHIP; then initialise IRQ routing.
 *
 * Returns 0 on success or when no in-kernel irqchip is used, or the
 * negative ioctl error if KVM_CREATE_IRQCHIP fails.
 */
static int kvm_irqchip_create(KVMState *s)
{
    QemuOptsList *list = qemu_find_opts("machine");
    int ret;

    if (QTAILQ_EMPTY(&list->head) ||
        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
                           "kernel_irqchip", true) ||
        !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        return 0;
    }

    /*
     * kvm_vm_ioctl() unconditionally fetches one pointer argument with
     * va_arg(), so an argument must always be supplied.
     */
    ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP, NULL);
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed\n");
        return ret;
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;

    kvm_init_irq_routing(s);

    return 0;
}

1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305
/*
 * Determine how many vCPUs KVM supports.
 *
 * Follows the recommended procedure from the kernel API documentation to
 * cope with older kernels that may be missing capabilities: prefer
 * KVM_CAP_MAX_VCPUS, fall back to KVM_CAP_NR_VCPUS, and finally assume 4.
 */
static int kvm_max_vcpus(KVMState *s)
{
    int limit = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);

    if (!limit) {
        limit = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
    }

    return limit ? limit : 4;
}

1306
int kvm_init(void)
A
aliguori 已提交
1307
{
1308 1309 1310
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
A
aliguori 已提交
1311
    KVMState *s;
1312
    const KVMCapabilityInfo *missing_cap;
A
aliguori 已提交
1313 1314
    int ret;
    int i;
1315
    int max_vcpus;
A
aliguori 已提交
1316

1317
    s = g_malloc0(sizeof(KVMState));
A
aliguori 已提交
1318

1319 1320 1321 1322 1323 1324 1325 1326
    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= getpagesize());

1327
#ifdef KVM_CAP_SET_GUEST_DEBUG
B
Blue Swirl 已提交
1328
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1329
#endif
J
Jan Kiszka 已提交
1330
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
A
aliguori 已提交
1331
        s->slots[i].slot = i;
J
Jan Kiszka 已提交
1332
    }
A
aliguori 已提交
1333
    s->vmfd = -1;
K
Kevin Wolf 已提交
1334
    s->fd = qemu_open("/dev/kvm", O_RDWR);
A
aliguori 已提交
1335 1336 1337 1338 1339 1340 1341 1342
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
J
Jan Kiszka 已提交
1343
        if (ret > 0) {
A
aliguori 已提交
1344
            ret = -EINVAL;
J
Jan Kiszka 已提交
1345
        }
A
aliguori 已提交
1346 1347 1348 1349 1350 1351 1352 1353 1354 1355
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

1356 1357 1358 1359 1360 1361 1362 1363
    max_vcpus = kvm_max_vcpus(s);
    if (smp_cpus > max_vcpus) {
        ret = -EINVAL;
        fprintf(stderr, "Number of SMP cpus requested (%d) exceeds max cpus "
                "supported by KVM (%d)\n", smp_cpus, max_vcpus);
        goto err;
    }

A
aliguori 已提交
1364
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
1365 1366 1367 1368 1369
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
1370
        ret = s->vmfd;
A
aliguori 已提交
1371
        goto err;
1372
    }
A
aliguori 已提交
1373

1374 1375 1376 1377
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
A
aliguori 已提交
1378
    }
1379
    if (missing_cap) {
1380
        ret = -EINVAL;
1381 1382
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
1383 1384 1385
        goto err;
    }

1386
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
A
aliguori 已提交
1387

1388
    s->broken_set_mem_region = 1;
1389
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1390 1391 1392 1393
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }

1394 1395 1396 1397
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

1398 1399 1400
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

1401 1402 1403 1404
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

1405 1406 1407 1408 1409 1410 1411 1412
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

J
Jan Kiszka 已提交
1413 1414 1415 1416
#ifdef KVM_CAP_PIT_STATE2
    s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
#endif

1417
#ifdef KVM_CAP_IRQ_ROUTING
1418
    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1419
#endif
1420

1421 1422
    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

1423
    s->irq_set_ioctl = KVM_IRQ_LINE;
1424
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1425
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1426 1427
    }

1428
    ret = kvm_arch_init(s);
J
Jan Kiszka 已提交
1429
    if (ret < 0) {
A
aliguori 已提交
1430
        goto err;
J
Jan Kiszka 已提交
1431
    }
A
aliguori 已提交
1432

1433 1434 1435 1436 1437
    ret = kvm_irqchip_create(s);
    if (ret < 0) {
        goto err;
    }

A
aliguori 已提交
1438
    kvm_state = s;
1439 1440
    memory_listener_register(&kvm_memory_listener, &address_space_memory);
    memory_listener_register(&kvm_io_listener, &address_space_io);
A
aliguori 已提交
1441

1442 1443
    s->many_ioeventfds = kvm_check_many_ioeventfds();

1444 1445
    cpu_interrupt_handler = kvm_handle_interrupt;

A
aliguori 已提交
1446 1447 1448
    return 0;

err:
1449 1450 1451 1452 1453
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
A
aliguori 已提交
1454
    }
1455
    g_free(s);
A
aliguori 已提交
1456 1457 1458 1459

    return ret;
}

1460 1461
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
                          uint32_t count)
A
aliguori 已提交
1462 1463 1464 1465 1466 1467 1468 1469
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
1470
                stb_p(ptr, cpu_inb(port));
A
aliguori 已提交
1471 1472
                break;
            case 2:
1473
                stw_p(ptr, cpu_inw(port));
A
aliguori 已提交
1474 1475
                break;
            case 4:
1476
                stl_p(ptr, cpu_inl(port));
A
aliguori 已提交
1477 1478 1479 1480 1481
                break;
            }
        } else {
            switch (size) {
            case 1:
1482
                cpu_outb(port, ldub_p(ptr));
A
aliguori 已提交
1483 1484
                break;
            case 2:
1485
                cpu_outw(port, lduw_p(ptr));
A
aliguori 已提交
1486 1487
                break;
            case 4:
1488
                cpu_outl(port, ldl_p(ptr));
A
aliguori 已提交
1489 1490 1491 1492 1493 1494 1495 1496
                break;
            }
        }

        ptr += size;
    }
}

1497
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * Returns EXCP_INTERRUPT for an emulation failure that the architecture
 * does not want to stop on (after dumping the CPU state), otherwise -1.
 */
static int kvm_handle_internal_error(CPUArchState *env, struct kvm_run *run)
{
    CPUState *cpu = ENV_GET_CPU(env);

    fprintf(stderr, "KVM internal error.");
    if (!kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        fprintf(stderr, "\n");
    } else {
        int idx;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}

1526
void kvm_flush_coalesced_mmio_buffer(void)
A
aliguori 已提交
1527 1528
{
    KVMState *s = kvm_state;
1529 1530 1531 1532 1533 1534 1535

    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

1536 1537
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
A
aliguori 已提交
1538 1539 1540 1541 1542 1543
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1544
            smp_wmb();
A
aliguori 已提交
1545 1546 1547
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
1548 1549

    s->coalesced_flush_in_progress = false;
A
aliguori 已提交
1550 1551
}

A
Andreas Färber 已提交
1552
static void do_kvm_cpu_synchronize_state(void *arg)
1553
{
A
Andreas Färber 已提交
1554
    CPUState *cpu = arg;
1555

A
Andreas Färber 已提交
1556 1557 1558
    if (!cpu->kvm_vcpu_dirty) {
        kvm_arch_get_registers(cpu);
        cpu->kvm_vcpu_dirty = true;
1559 1560 1561
    }
}

1562
/*
 * Ensure the register copy of the CPU behind @env is current by running
 * do_kvm_cpu_synchronize_state() on that CPU, unless it is already dirty.
 */
void kvm_cpu_synchronize_state(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    if (cpu->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
}

1571
/* Write the reset-level register state of @cpu to KVM and clear its dirty flag. */
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);

    cpu->kvm_vcpu_dirty = false;
}

1577
/* Write the full register state of @cpu to KVM and clear its dirty flag. */
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);

    cpu->kvm_vcpu_dirty = false;
}

1583
/*
 * Run the vCPU behind @env until an exit needs attention outside this loop.
 *
 * Each iteration writes back dirty registers, drops the iothread lock
 * around KVM_RUN, and dispatches on the exit reason.  Handlers returning 0
 * cause another iteration.
 *
 * Returns EXCP_HLT if pending async events were processed instead of
 * running; otherwise the first non-zero handler result.  A negative result
 * dumps the CPU state and stops the VM with RUN_STATE_INTERNAL_ERROR.
 */
int kvm_cpu_exec(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);
    struct kvm_run *run = cpu->kvm_run;
    int status, ioctl_rc;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        cpu->exit_request = 0;
        return EXCP_HLT;
    }

    do {
        if (cpu->kvm_vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->kvm_vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (cpu->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        qemu_mutex_unlock_iothread();

        ioctl_rc = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        kvm_arch_post_run(cpu, run);

        if (ioctl_rc == -EINTR || ioctl_rc == -EAGAIN) {
            DPRINTF("io window exit\n");
            status = EXCP_INTERRUPT;
            break;
        }
        if (ioctl_rc < 0) {
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-ioctl_rc));
            abort();
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            status = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            status = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            status = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            status = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            status = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            status = kvm_handle_internal_error(env, run);
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            status = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (status == 0);

    if (status < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    cpu->exit_request = 0;
    return status;
}

1682
/*
 * Issue ioctl @type on the /dev/kvm descriptor of @s.  Exactly one
 * pointer-sized variadic argument is read and passed through.
 *
 * Returns the ioctl result, with a -1 failure mapped to -errno.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_ioctl(type, payload);
    rc = ioctl(s->fd, type, payload);

    return (rc == -1) ? -errno : rc;
}

1700
/*
 * Issue ioctl @type on the VM descriptor of @s.  Exactly one
 * pointer-sized variadic argument is read and passed through.
 *
 * Returns the ioctl result, with a -1 failure mapped to -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_vm_ioctl(type, payload);
    rc = ioctl(s->vmfd, type, payload);

    return (rc == -1) ? -errno : rc;
}

1718
/*
 * Issue ioctl @type on the vCPU descriptor of @cpu.  Exactly one
 * pointer-sized variadic argument is read and passed through.
 *
 * Returns the ioctl result, with a -1 failure mapped to -errno.
 */
int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, payload);
    rc = ioctl(cpu->kvm_fd, type, payload);

    return (rc == -1) ? -errno : rc;
}
A
aliguori 已提交
1735 1736 1737

int kvm_has_sync_mmu(void)
{
1738
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
A
aliguori 已提交
1739
}
1740

1741 1742 1743 1744 1745
/* Return the KVM_CAP_VCPU_EVENTS probe result cached in kvm_state. */
int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

1746 1747 1748 1749 1750
/* Return the KVM_CAP_X86_ROBUST_SINGLESTEP probe result cached in kvm_state. */
int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

1751 1752 1753 1754 1755
/* Return the KVM_CAP_DEBUGREGS probe result cached in kvm_state. */
int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

1756 1757 1758 1759 1760 1761 1762 1763 1764 1765
/* Return the KVM_CAP_XSAVE probe result cached in kvm_state. */
int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

/* Return the KVM_CAP_XCRS probe result cached in kvm_state. */
int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

J
Jan Kiszka 已提交
1766 1767 1768 1769 1770
/* Return the KVM_CAP_PIT_STATE2 probe result cached in kvm_state. */
int kvm_has_pit_state2(void)
{
    return kvm_state->pit_state2;
}

1771 1772 1773 1774 1775 1776 1777 1778
/*
 * Return the cached many-ioeventfds probe result, or 0 when KVM is not
 * enabled (kvm_state is not consulted in that case).
 */
int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

1779 1780
/*
 * Query KVM_CAP_IRQ_ROUTING on the global KVM state.  Always false when
 * the build headers do not define that capability.
 */
int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

1788 1789 1790 1791 1792
/* Return the KVM_CAP_PCI_2_3 probe result cached in kvm_state. */
int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

1793
/*
 * Allocate @size bytes of guest RAM.  On s390x the architecture allocator
 * is tried first; if it yields nothing (or on other targets) anonymous
 * memory from qemu_anon_ram_alloc() is used.
 */
void *kvm_ram_alloc(ram_addr_t size)
{
#ifdef TARGET_S390X
    void *arch_mem = kvm_arch_ram_alloc(size);

    if (arch_mem != NULL) {
        return arch_mem;
    }
#endif
    return qemu_anon_ram_alloc(size);
}

1806 1807
void kvm_setup_guest_memory(void *start, size_t size)
{
1808 1809 1810
#ifdef CONFIG_VALGRIND_H
    VALGRIND_MAKE_MEM_DEFINED(start, size);
#endif
1811
    if (!kvm_has_sync_mmu()) {
A
Andreas Färber 已提交
1812
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1813 1814

        if (ret) {
A
Andreas Färber 已提交
1815 1816 1817
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1818 1819 1820 1821 1822
            exit(1);
        }
    }
}

1823
#ifdef KVM_CAP_SET_GUEST_DEBUG
1824
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
1825 1826 1827 1828
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

1829
    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
J
Jan Kiszka 已提交
1830
        if (bp->pc == pc) {
1831
            return bp;
J
Jan Kiszka 已提交
1832
        }
1833 1834 1835 1836
    }
    return NULL;
}

1837
/* Non-zero when the KVM state of @cpu holds at least one software breakpoint. */
int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

G
Glauber Costa 已提交
1842 1843
struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
1844
    CPUState *cpu;
G
Glauber Costa 已提交
1845 1846 1847 1848 1849 1850
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
J
Jan Kiszka 已提交
1851

1852 1853
    dbg_data->err = kvm_vcpu_ioctl(dbg_data->cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
G
Glauber Costa 已提交
1854 1855
}

1856
/*
 * Recompute the guest debug settings of the CPU behind @env and apply them
 * on that CPU.  @reinject_trap seeds the control word; single-stepping adds
 * the enable and single-step flags, and the architecture hook fills in the
 * rest.
 *
 * Returns the result of the KVM_SET_GUEST_DEBUG ioctl.
 */
int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap)
{
    CPUState *cpu = ENV_GET_CPU(env);
    struct kvm_set_guest_debug_data request;

    request.cpu = cpu;
    request.dbg.control = reinject_trap;
    if (env->singlestep_enabled) {
        request.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &request.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &request);
    return request.err;
}

1873
/*
 * Insert a breakpoint of @type at @addr (@len is used for hardware
 * breakpoints only) and refresh the debug state of every CPU.
 *
 * Software breakpoints are reference counted: inserting at an address that
 * already has one just bumps its use count.
 *
 * Returns 0 on success or the first non-zero error from the architecture
 * hooks or from kvm_update_guest_debug().
 */
int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    CPUState *current_cpu = ENV_GET_CPU(current_env);
    struct kvm_sw_breakpoint *bp;
    CPUArchState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        /* g_malloc() aborts on allocation failure; no NULL check needed. */
        bp = g_malloc(sizeof(*bp));

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_cpu->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

1919
/*
 * Remove a breakpoint of @type at @addr (@len is used for hardware
 * breakpoints only) and refresh the debug state of every CPU.
 *
 * A software breakpoint with more than one user only has its use count
 * decremented.
 *
 * Returns 0 on success, -ENOENT if no software breakpoint exists at @addr,
 * or the first non-zero error from the architecture hooks or from
 * kvm_update_guest_debug().
 */
int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    CPUState *current_cpu = ENV_GET_CPU(current_env);
    CPUArchState *it;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        struct kvm_sw_breakpoint *found =
            kvm_find_sw_breakpoint(current_cpu, addr);

        if (!found) {
            return -ENOENT;
        }

        if (found->use_count > 1) {
            found->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_cpu, found);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_cpu->kvm_state->kvm_sw_breakpoints, found,
                      entry);
        g_free(found);
    }

    for (it = first_cpu; it != NULL; it = it->next_cpu) {
        rc = kvm_update_guest_debug(it, 0);
        if (rc) {
            return rc;
        }
    }
    return 0;
}

1961
/*
 * Drop every software and hardware breakpoint and refresh the debug state
 * of all CPUs.
 *
 * @current_env: CPU context tried first when removing each software
 *               breakpoint; its KVMState owns the breakpoint list.
 *
 * Failures are not reported: each list entry is unlinked and freed whether
 * or not an arch-level removal succeeded, and the result of
 * kvm_update_guest_debug() is ignored.
 */
void kvm_remove_all_breakpoints(CPUArchState *current_env)
{
    CPUState *current_cpu = ENV_GET_CPU(current_env);
    KVMState *s = current_cpu->kvm_state;
    struct kvm_sw_breakpoint *bp, *next;
    CPUArchState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_cpu, bp) != 0) {
            /*
             * Try harder to find a CPU that currently sees the breakpoint:
             * walk the CPU list until one removal succeeds or the list ends.
             */
            env = first_cpu;
            while (env != NULL &&
                   kvm_arch_remove_sw_breakpoint(ENV_GET_CPU(env), bp) != 0) {
                env = env->next_cpu;
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }

    kvm_arch_remove_all_hw_breakpoints();

    env = first_cpu;
    while (env != NULL) {
        kvm_update_guest_debug(env, 0);
        env = env->next_cpu;
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

1991
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: guest debugging
 * is not available, so the request is always rejected with -EINVAL.
 */
int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

1996
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: no breakpoint can
 * be inserted, so every call fails with -EINVAL.
 */
int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

2002
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: there is nothing
 * to remove, so every call fails with -EINVAL.
 */
int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

2008
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: intentionally a
 * no-op.
 */
void kvm_remove_all_breakpoints(CPUArchState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2012

2013
/*
 * Install (or clear) the signal mask of a vCPU via KVM_SET_SIGNAL_MASK.
 *
 * @env:    CPU whose vCPU receives the ioctl
 * @sigset: mask to install; when NULL the ioctl is issued with a NULL
 *          argument instead of a mask structure
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUArchState *env, const sigset_t *sigset)
{
    CPUState *cpu = ENV_GET_CPU(env);
    struct kvm_signal_mask *mask = NULL;
    int ret;

    if (sigset != NULL) {
        /* Header followed by room for a full copy of the caller's sigset. */
        mask = g_malloc(sizeof(*mask) + sizeof(*sigset));

        /*
         * len is fixed at 8 even though sizeof(*sigset) bytes are copied.
         * NOTE(review): presumably the sigset size the kernel expects;
         * confirm against the KVM API documentation.
         */
        mask->len = 8;
        memcpy(mask->sigset, sigset, sizeof(*sigset));
    }

    ret = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, mask);

    /* g_free() accepts NULL, so the no-mask case needs no special casing. */
    g_free(mask);

    return ret;
}
2032
/*
 * Forward a SIGBUS associated with @cpu to the architecture-specific
 * handler and return its result unchanged.
 */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
    int ret = kvm_arch_on_sigbus_vcpu(cpu, code, addr);

    return ret;
}

/*
 * Forward a SIGBUS that is not tied to a particular vCPU to the
 * architecture-specific handler and return its result unchanged.
 */
int kvm_on_sigbus(int code, void *addr)
{
    int ret = kvm_arch_on_sigbus(code, addr);

    return ret;
}