kvm.c 66.5 KB
Newer Older
A
aliguori 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
18
#include <sys/utsname.h>
A
aliguori 已提交
19 20

#include <linux/kvm.h>
J
Jan Kiszka 已提交
21
#include <linux/kvm_para.h>
A
aliguori 已提交
22 23

#include "qemu-common.h"
24 25
#include "sysemu/sysemu.h"
#include "sysemu/kvm.h"
26
#include "kvm_i386.h"
A
aliguori 已提交
27
#include "cpu.h"
28
#include "exec/gdbstub.h"
29 30
#include "qemu/host-utils.h"
#include "qemu/config-file.h"
P
Paolo Bonzini 已提交
31 32
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
33
#include "exec/ioport.h"
34
#include "hyperv.h"
35
#include "hw/pci/pci.h"
A
aliguori 已提交
36 37 38 39

//#define DEBUG_KVM

#ifdef DEBUG_KVM
40
#define DPRINTF(fmt, ...) \
A
aliguori 已提交
41 42
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
43
#define DPRINTF(fmt, ...) \
A
aliguori 已提交
44 45 46
    do { } while (0)
#endif

47 48 49
#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

M
Marcelo Tosatti 已提交
50 51 52 53 54 55 56
#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

57 58 59 60 61 62
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};
63

64 65
static bool has_msr_star;
static bool has_msr_hsave_pa;
66
static bool has_msr_tsc_adjust;
67
static bool has_msr_tsc_deadline;
68
static bool has_msr_async_pf_en;
M
Michael S. Tsirkin 已提交
69
static bool has_msr_pv_eoi_en;
A
Avi Kivity 已提交
70
static bool has_msr_misc_enable;
71
static bool has_msr_kvm_steal_time;
72
static int lm_capable_kernel;
73

74 75 76 77 78
bool kvm_allows_irq0_override(void)
{
    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}

79 80 81 82 83 84
static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
85
    cpuid = (struct kvm_cpuid2 *)g_malloc0(size);
86 87
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
88 89 90
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
91 92
    if (r < 0) {
        if (r == -E2BIG) {
93
            g_free(cpuid);
94 95 96 97 98 99 100 101 102 103
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

104 105 106 107 108 109 110 111 112 113 114 115 116
/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    return cpuid;
}

117 118 119 120 121 122 123 124 125 126 127
struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
    { -1, -1 }
};

128
static int get_para_features(KVMState *s)
129 130 131 132
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
133
        if (kvm_check_extension(s, para_features[i].cap)) {
134 135 136 137 138 139 140 141
            features |= (1 << para_features[i].feature);
        }
    }

    return features;
}


142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

181
uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
182
                                      uint32_t index, int reg)
183 184 185 186
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;
187
    bool found = false;
188

189
    cpuid = get_supported_cpuid(s);
190

191 192 193 194
    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        found = true;
        ret = cpuid_entry_get_reg(entry, reg);
195 196
    }

197 198
    /* Fixups for the data returned by KVM, below */

199 200 201
    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
202 203 204 205 206
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
207 208 209 210 211 212 213 214
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }
215 216 217 218 219 220

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
221
        }
222 223 224 225 226 227
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
228 229
    }

230
    g_free(cpuid);
231

232
    /* fallback for older kernels */
233
    if ((function == KVM_CPUID_FEATURES) && !found) {
234
        ret = get_para_features(s);
235
    }
236 237

    return ret;
G
Gleb Natapov 已提交
238 239
}

240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
255
        g_free(page);
256 257 258 259 260 261 262 263 264 265 266 267
    }
}

static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
268
    page = g_malloc(sizeof(HWPoisonPage));
269 270 271 272
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

M
Marcelo Tosatti 已提交
273 274 275 276 277
static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    int r;

278
    r = kvm_check_extension(s, KVM_CAP_MCE);
M
Marcelo Tosatti 已提交
279 280 281 282 283 284 285
    if (r > 0) {
        *max_banks = r;
        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    }
    return -ENOSYS;
}

286
static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
M
Marcelo Tosatti 已提交
287
{
288
    CPUX86State *env = &cpu->env;
289 290 291
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    uint64_t mcg_status = MCG_STATUS_MCIP;
M
Marcelo Tosatti 已提交
292

293 294 295 296 297 298
    if (code == BUS_MCEERR_AR) {
        status |= MCI_STATUS_AR | 0x134;
        mcg_status |= MCG_STATUS_EIPV;
    } else {
        status |= 0xc0;
        mcg_status |= MCG_STATUS_RIPV;
299
    }
300
    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
301 302 303
                       (MCM_ADDR_PHYS << 6) | 0xc,
                       cpu_x86_support_mca_broadcast(env) ?
                       MCE_INJECT_BROADCAST : 0);
304 305 306 307 308 309 310 311
}

static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}

A
Andreas Färber 已提交
312
int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
313
{
A
Andreas Färber 已提交
314 315
    X86CPU *cpu = X86_CPU(c);
    CPUX86State *env = &cpu->env;
316
    ram_addr_t ram_addr;
A
Avi Kivity 已提交
317
    hwaddr paddr;
318 319

    if ((env->mcg_cap & MCG_SER_P) && addr
320
        && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
321
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
322
            !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
323 324 325 326 327 328 329 330 331
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
            /* Hope we are lucky for AO MCE */
            if (code == BUS_MCEERR_AO) {
                return 0;
            } else {
                hardware_memory_error();
            }
        }
332
        kvm_hwpoison_page_add(ram_addr);
333
        kvm_mce_inject(cpu, paddr, code);
334
    } else {
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

int kvm_arch_on_sigbus(int code, void *addr)
{
    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
        ram_addr_t ram_addr;
A
Avi Kivity 已提交
350
        hwaddr paddr;
351 352

        /* Hope we are lucky for AO MCE */
353
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
354 355
            !kvm_physical_memory_addr_from_host(CPU(first_cpu)->kvm_state,
                                                addr, &paddr)) {
356 357 358 359
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!: %p\n", addr);
            return 0;
        }
360
        kvm_hwpoison_page_add(ram_addr);
361
        kvm_mce_inject(x86_env_get_cpu(first_cpu), paddr, code);
362
    } else {
363 364 365 366 367 368 369 370 371 372
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}
M
Marcelo Tosatti 已提交
373

374
static int kvm_inject_mce_oldstyle(X86CPU *cpu)
375
{
376 377
    CPUX86State *env = &cpu->env;

378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        env->exception_injected = -1;

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
         */
        for (bank = 0; bank < bank_num; bank++) {
            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
                break;
            }
        }
        assert(bank < bank_num);

        mce.bank = bank;
        mce.status = env->mce_banks[bank * 4 + 1];
        mce.mcg_status = env->mcg_status;
        mce.addr = env->mce_banks[bank * 4 + 2];
        mce.misc = env->mce_banks[bank * 4 + 3];

401
        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
402 403 404 405
    }
    return 0;
}

406
static void cpu_update_state(void *opaque, int running, RunState state)
407
{
408
    CPUX86State *env = opaque;
409 410 411 412 413 414

    if (running) {
        env->tsc_valid = false;
    }
}

415
unsigned long kvm_arch_vcpu_id(CPUState *cs)
416
{
417 418
    X86CPU *cpu = X86_CPU(cs);
    return cpu->env.cpuid_apic_id;
419 420
}

421
#define KVM_MAX_CPUID_ENTRIES  100
422

A
Andreas Färber 已提交
423
int kvm_arch_init_vcpu(CPUState *cs)
A
aliguori 已提交
424 425
{
    struct {
426
        struct kvm_cpuid2 cpuid;
427
        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
428
    } QEMU_PACKED cpuid_data;
A
Andreas Färber 已提交
429 430
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
431
    uint32_t limit, i, j, cpuid_i;
432
    uint32_t unused;
G
Gleb Natapov 已提交
433 434
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
435
    int r;
A
aliguori 已提交
436 437 438

    cpuid_i = 0;

G
Gleb Natapov 已提交
439 440 441 442
    /* Paravirtualization CPUIDs */
    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_SIGNATURE;
443 444 445 446 447 448 449
    if (!hyperv_enabled()) {
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
    } else {
        memcpy(signature, "Microsoft Hv", 12);
        c->eax = HYPERV_CPUID_MIN;
    }
G
Gleb Natapov 已提交
450 451 452 453 454 455 456
    c->ebx = signature[0];
    c->ecx = signature[1];
    c->edx = signature[2];

    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_FEATURES;
457
    c->eax = env->features[FEAT_KVM];
458

459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
    if (hyperv_enabled()) {
        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_VERSION;
        c->eax = 0x00001bbc;
        c->ebx = 0x00060001;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_FEATURES;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
        }

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
        }
        c->ebx = hyperv_get_spinlock_retries();

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_IMPLEMENT_LIMITS;
        c->eax = 0x40;
        c->ebx = 0x40;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = KVM_CPUID_SIGNATURE_NEXT;
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];
    }

507
    has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);
G
Gleb Natapov 已提交
508

M
Michael S. Tsirkin 已提交
509 510
    has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);

511 512
    has_msr_kvm_steal_time = c->eax & (1 << KVM_FEATURE_STEAL_TIME);

513
    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
A
aliguori 已提交
514 515

    for (i = 0; i <= limit; i++) {
516 517 518 519
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
            abort();
        }
G
Gleb Natapov 已提交
520
        c = &cpuid_data.entries[cpuid_i++];
521 522

        switch (i) {
523 524 525 526 527
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
528 529 530 531
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;
532 533

            for (j = 1; j < times; ++j) {
534 535 536 537 538
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
                    abort();
                }
539
                c = &cpuid_data.entries[cpuid_i++];
540
                c->function = i;
541 542
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
543 544 545
            }
            break;
        }
546 547 548 549
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
550 551 552
                if (i == 0xd && j == 64) {
                    break;
                }
553 554 555
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
556
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
557

558
                if (i == 4 && c->eax == 0) {
559
                    break;
560 561
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
562
                    break;
563 564
                }
                if (i == 0xd && c->eax == 0) {
565
                    continue;
566
                }
567 568 569 570 571
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
572
                c = &cpuid_data.entries[cpuid_i++];
573 574 575 576
            }
            break;
        default:
            c->function = i;
577 578
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
579 580
            break;
        }
A
aliguori 已提交
581
    }
582
    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
A
aliguori 已提交
583 584

    for (i = 0x80000000; i <= limit; i++) {
585 586 587 588
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
            abort();
        }
G
Gleb Natapov 已提交
589
        c = &cpuid_data.entries[cpuid_i++];
A
aliguori 已提交
590 591

        c->function = i;
592 593
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
A
aliguori 已提交
594 595
    }

596 597 598 599 600
    /* Call Centaur's CPUID instructions they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
601 602 603 604
            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
                abort();
            }
605 606 607 608 609 610 611 612
            c = &cpuid_data.entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

A
aliguori 已提交
613 614
    cpuid_data.cpuid.nent = cpuid_i;

M
Marcelo Tosatti 已提交
615
    if (((env->cpuid_version >> 8)&0xF) >= 6
616
        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
617
           (CPUID_MCE | CPUID_MCA)
618
        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
M
Marcelo Tosatti 已提交
619 620
        uint64_t mcg_cap;
        int banks;
J
Jan Kiszka 已提交
621
        int ret;
M
Marcelo Tosatti 已提交
622

623
        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
624 625 626
        if (ret < 0) {
            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
            return ret;
M
Marcelo Tosatti 已提交
627
        }
628 629 630 631 632 633

        if (banks > MCE_BANKS_DEF) {
            banks = MCE_BANKS_DEF;
        }
        mcg_cap &= MCE_CAP_DEF;
        mcg_cap |= banks;
634
        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
635 636 637 638 639 640
        if (ret < 0) {
            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
            return ret;
        }

        env->mcg_cap = mcg_cap;
M
Marcelo Tosatti 已提交
641 642
    }

643 644
    qemu_add_vm_change_state_handler(cpu_update_state, env);

645
    cpuid_data.cpuid.padding = 0;
646
    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
647 648 649
    if (r) {
        return r;
    }
650

651
    r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
652
    if (r && env->tsc_khz) {
653
        r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
654 655 656 657 658 659
        if (r < 0) {
            fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
            return r;
        }
    }

660 661 662 663
    if (kvm_has_xsave()) {
        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
    }

664
    return 0;
A
aliguori 已提交
665 666
}

A
Andreas Färber 已提交
667
void kvm_arch_reset_vcpu(CPUState *cs)
J
Jan Kiszka 已提交
668
{
A
Andreas Färber 已提交
669 670
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
671

672
    env->exception_injected = -1;
673
    env->interrupt_injected = -1;
J
Jan Kiszka 已提交
674
    env->xcr0 = 1;
M
Marcelo Tosatti 已提交
675
    if (kvm_irqchip_in_kernel()) {
676
        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
M
Marcelo Tosatti 已提交
677 678 679 680
                                          KVM_MP_STATE_UNINITIALIZED;
    } else {
        env->mp_state = KVM_MP_STATE_RUNNABLE;
    }
J
Jan Kiszka 已提交
681 682
}

683
static int kvm_get_supported_msrs(KVMState *s)
A
aliguori 已提交
684
{
M
Marcelo Tosatti 已提交
685
    static int kvm_supported_msrs;
686
    int ret = 0;
A
aliguori 已提交
687 688

    /* first time */
M
Marcelo Tosatti 已提交
689
    if (kvm_supported_msrs == 0) {
A
aliguori 已提交
690 691
        struct kvm_msr_list msr_list, *kvm_msr_list;

M
Marcelo Tosatti 已提交
692
        kvm_supported_msrs = -1;
A
aliguori 已提交
693 694 695

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
A
aliguori 已提交
696
        msr_list.nmsrs = 0;
697
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
698
        if (ret < 0 && ret != -E2BIG) {
699
            return ret;
700
        }
701 702
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe amount of 1K. */
703
        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
704 705
                                              msr_list.nmsrs *
                                              sizeof(msr_list.indices[0])));
A
aliguori 已提交
706

707
        kvm_msr_list->nmsrs = msr_list.nmsrs;
708
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
A
aliguori 已提交
709 710 711 712 713
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
714
                    has_msr_star = true;
M
Marcelo Tosatti 已提交
715 716 717
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
718
                    has_msr_hsave_pa = true;
M
Marcelo Tosatti 已提交
719
                    continue;
A
aliguori 已提交
720
                }
721 722 723 724
                if (kvm_msr_list->indices[i] == MSR_TSC_ADJUST) {
                    has_msr_tsc_adjust = true;
                    continue;
                }
725 726 727 728
                if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
                    has_msr_tsc_deadline = true;
                    continue;
                }
A
Avi Kivity 已提交
729 730 731 732
                if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                    has_msr_misc_enable = true;
                    continue;
                }
A
aliguori 已提交
733 734 735
            }
        }

736
        g_free(kvm_msr_list);
A
aliguori 已提交
737 738
    }

739
    return ret;
A
aliguori 已提交
740 741
}

742
int kvm_arch_init(KVMState *s)
743
{
J
Jan Kiszka 已提交
744
    QemuOptsList *list = qemu_find_opts("machine");
745
    uint64_t identity_base = 0xfffbc000;
J
Jan Kiszka 已提交
746
    uint64_t shadow_mem;
747
    int ret;
748
    struct utsname utsname;
749

750
    ret = kvm_get_supported_msrs(s);
751 752 753
    if (ret < 0) {
        return ret;
    }
754 755 756 757

    uname(&utsname);
    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

J
Jes Sorensen 已提交
758
    /*
759 760 761 762 763 764 765 766 767
     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
     * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
     * Since these must be part of guest physical memory, we need to allocate
     * them, both by setting their start addresses in the kernel and by
     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
     *
     * Older KVM versions may not support setting the identity map base. In
     * that case we need to stick with the default, i.e. a 256K maximum BIOS
     * size.
J
Jes Sorensen 已提交
768
     */
769 770 771 772 773 774 775 776
    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
        /* Allows up to 16M BIOSes. */
        identity_base = 0xfeffc000;

        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
        if (ret < 0) {
            return ret;
        }
J
Jes Sorensen 已提交
777
    }
778

779 780
    /* Set TSS base one page after EPT identity map. */
    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
781 782 783 784
    if (ret < 0) {
        return ret;
    }

785 786
    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
787
    if (ret < 0) {
788
        fprintf(stderr, "e820_add_entry() table is full\n");
789 790
        return ret;
    }
791
    qemu_register_reset(kvm_unpoison_all, NULL);
792

J
Jan Kiszka 已提交
793 794 795 796 797 798 799 800 801 802 803
    if (!QTAILQ_EMPTY(&list->head)) {
        shadow_mem = qemu_opt_get_size(QTAILQ_FIRST(&list->head),
                                       "kvm_shadow_mem", -1);
        if (shadow_mem != -1) {
            shadow_mem /= 4096;
            ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
            if (ret < 0) {
                return ret;
            }
        }
    }
804
    return 0;
A
aliguori 已提交
805
}
806

A
aliguori 已提交
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
831
    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
A
aliguori 已提交
832 833 834 835 836 837
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
838
    lhs->padding = 0;
A
aliguori 已提交
839 840 841 842 843 844 845
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
846 847 848 849 850 851 852 853
    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
                 (rhs->present * DESC_P_MASK) |
                 (rhs->dpl << DESC_DPL_SHIFT) |
                 (rhs->db << DESC_B_SHIFT) |
                 (rhs->s * DESC_S_MASK) |
                 (rhs->l << DESC_L_SHIFT) |
                 (rhs->g * DESC_G_MASK) |
                 (rhs->avl * DESC_AVL_MASK);
A
aliguori 已提交
854 855 856 857
}

static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
858
    if (set) {
A
aliguori 已提交
859
        *kvm_reg = *qemu_reg;
860
    } else {
A
aliguori 已提交
861
        *qemu_reg = *kvm_reg;
862
    }
A
aliguori 已提交
863 864
}

865
static int kvm_getput_regs(X86CPU *cpu, int set)
A
aliguori 已提交
866
{
867
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
868 869 870 871
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
872
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
873
        if (ret < 0) {
A
aliguori 已提交
874
            return ret;
875
        }
A
aliguori 已提交
876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

900
    if (set) {
901
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
902
    }
A
aliguori 已提交
903 904 905 906

    return ret;
}

907
static int kvm_put_fpu(X86CPU *cpu)
A
aliguori 已提交
908
{
909
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
910 911 912 913 914 915 916
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
917 918 919
    fpu.last_opcode = env->fpop;
    fpu.last_ip = env->fpip;
    fpu.last_dp = env->fpdp;
920 921 922
    for (i = 0; i < 8; ++i) {
        fpu.ftwx |= (!env->fptags[i]) << i;
    }
A
aliguori 已提交
923 924 925 926
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

927
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
A
aliguori 已提交
928 929
}

930 931
#define XSAVE_FCW_FSW     0
#define XSAVE_FTW_FOP     1
932 933 934 935 936 937 938 939
#define XSAVE_CWD_RIP     2
#define XSAVE_CWD_RDP     4
#define XSAVE_MXCSR       6
#define XSAVE_ST_SPACE    8
#define XSAVE_XMM_SPACE   40
#define XSAVE_XSTATE_BV   128
#define XSAVE_YMMH_SPACE  144

940
static int kvm_put_xsave(X86CPU *cpu)
941
{
942
    CPUX86State *env = &cpu->env;
943
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
944
    uint16_t cwd, swd, twd;
945
    int i, r;
946

947
    if (!kvm_has_xsave()) {
948
        return kvm_put_fpu(cpu);
949
    }
950 951

    memset(xsave, 0, sizeof(struct kvm_xsave));
B
Blue Swirl 已提交
952
    twd = 0;
953 954 955
    swd = env->fpus & ~(7 << 11);
    swd |= (env->fpstt & 7) << 11;
    cwd = env->fpuc;
956
    for (i = 0; i < 8; ++i) {
957
        twd |= (!env->fptags[i]) << i;
958
    }
959 960
    xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
    xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
961 962
    memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
    memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
963 964 965 966 967 968 969 970
    memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
            sizeof env->fpregs);
    memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
            sizeof env->xmm_regs);
    xsave->region[XSAVE_MXCSR] = env->mxcsr;
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
971
    r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
972
    return r;
973 974
}

975
static int kvm_put_xcrs(X86CPU *cpu)
976
{
977
    CPUX86State *env = &cpu->env;
978 979
    struct kvm_xcrs xcrs;

980
    if (!kvm_has_xcrs()) {
981
        return 0;
982
    }
983 984 985 986 987

    xcrs.nr_xcrs = 1;
    xcrs.flags = 0;
    xcrs.xcrs[0].xcr = 0;
    xcrs.xcrs[0].value = env->xcr0;
988
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
989 990
}

991
static int kvm_put_sregs(X86CPU *cpu)
A
aliguori 已提交
992
{
993
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
994 995
    struct kvm_sregs sregs;

996 997 998 999 1000
    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }
A
aliguori 已提交
1001 1002

    if ((env->eflags & VM_MASK)) {
1003 1004 1005 1006 1007 1008
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
A
aliguori 已提交
1009
    } else {
1010 1011 1012 1013 1014 1015
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);
A
aliguori 已提交
1016 1017 1018 1019 1020 1021 1022
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
1023
    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
A
aliguori 已提交
1024 1025
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;
1026
    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
A
aliguori 已提交
1027 1028 1029 1030 1031 1032

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

1033 1034
    sregs.cr8 = cpu_get_apic_tpr(env->apic_state);
    sregs.apic_base = cpu_get_apic_base(env->apic_state);
A
aliguori 已提交
1035 1036 1037

    sregs.efer = env->efer;

1038
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
A
aliguori 已提交
1039 1040 1041 1042 1043 1044 1045 1046 1047
}

static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}

1048
static int kvm_put_msrs(X86CPU *cpu, int level)
A
aliguori 已提交
1049
{
1050
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
1051 1052 1053 1054 1055
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
H
Hidetoshi Seto 已提交
1056
    int n = 0;
A
aliguori 已提交
1057 1058 1059 1060

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
1061
    kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
1062
    if (has_msr_star) {
1063 1064
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
    }
1065
    if (has_msr_hsave_pa) {
M
Marcelo Tosatti 已提交
1066
        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
1067
    }
1068 1069 1070
    if (has_msr_tsc_adjust) {
        kvm_msr_entry_set(&msrs[n++], MSR_TSC_ADJUST, env->tsc_adjust);
    }
1071 1072 1073
    if (has_msr_tsc_deadline) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
    }
A
Avi Kivity 已提交
1074 1075 1076 1077
    if (has_msr_misc_enable) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
A
aliguori 已提交
1078
#ifdef TARGET_X86_64
1079 1080 1081 1082 1083 1084
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
A
aliguori 已提交
1085
#endif
1086
    if (level == KVM_PUT_FULL_STATE) {
1087 1088 1089 1090 1091 1092 1093 1094 1095
        /*
         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
         * writeback. Until this is fixed, we only write the offset to SMP
         * guests after migration, desynchronizing the VCPUs, but avoiding
         * huge jump-backs that would occur without any writeback at all.
         */
        if (smp_cpus == 1 || env->tsc != 0) {
            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        }
J
Jan Kiszka 已提交
1096 1097 1098 1099 1100 1101 1102
    }
    /*
     * The following paravirtual MSRs have side effects on the guest or are
     * too heavy for normal writeback. Limit them to reset or full state
     * updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
1103 1104 1105
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
1106 1107 1108 1109
        if (has_msr_async_pf_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
                              env->async_pf_en_msr);
        }
M
Michael S. Tsirkin 已提交
1110 1111 1112 1113
        if (has_msr_pv_eoi_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
                              env->pv_eoi_en_msr);
        }
1114 1115 1116 1117
        if (has_msr_kvm_steal_time) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_STEAL_TIME,
                              env->steal_time_msr);
        }
1118 1119 1120 1121 1122 1123 1124
        if (hyperv_hypercall_available()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID, 0);
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL, 0);
        }
        if (hyperv_vapic_recommended()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
        }
1125
    }
1126
    if (env->mcg_cap) {
H
Hidetoshi Seto 已提交
1127
        int i;
1128

1129 1130 1131 1132
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
1133 1134
        }
    }
1135

A
aliguori 已提交
1136 1137
    msr_data.info.nmsrs = n;

1138
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
A
aliguori 已提交
1139 1140 1141 1142

}


1143
static int kvm_get_fpu(X86CPU *cpu)
A
aliguori 已提交
1144
{
1145
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
1146 1147 1148
    struct kvm_fpu fpu;
    int i, ret;

1149
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
1150
    if (ret < 0) {
A
aliguori 已提交
1151
        return ret;
1152
    }
A
aliguori 已提交
1153 1154 1155 1156

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
1157 1158 1159
    env->fpop = fpu.last_opcode;
    env->fpip = fpu.last_ip;
    env->fpdp = fpu.last_dp;
1160 1161 1162
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    }
A
aliguori 已提交
1163 1164 1165 1166 1167 1168 1169
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

1170
static int kvm_get_xsave(X86CPU *cpu)
1171
{
1172
    CPUX86State *env = &cpu->env;
1173
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
1174
    int ret, i;
1175
    uint16_t cwd, swd, twd;
1176

1177
    if (!kvm_has_xsave()) {
1178
        return kvm_get_fpu(cpu);
1179
    }
1180

1181
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
1182
    if (ret < 0) {
1183
        return ret;
1184
    }
1185

1186 1187 1188 1189
    cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
    swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
    twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
    env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
1190 1191 1192
    env->fpstt = (swd >> 11) & 7;
    env->fpus = swd;
    env->fpuc = cwd;
1193
    for (i = 0; i < 8; ++i) {
1194
        env->fptags[i] = !((twd >> i) & 1);
1195
    }
1196 1197
    memcpy(&env->fpip, &xsave->region[XSAVE_CWD_RIP], sizeof(env->fpip));
    memcpy(&env->fpdp, &xsave->region[XSAVE_CWD_RDP], sizeof(env->fpdp));
1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208
    env->mxcsr = xsave->region[XSAVE_MXCSR];
    memcpy(env->fpregs, &xsave->region[XSAVE_ST_SPACE],
            sizeof env->fpregs);
    memcpy(env->xmm_regs, &xsave->region[XSAVE_XMM_SPACE],
            sizeof env->xmm_regs);
    env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
    memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
            sizeof env->ymmh_regs);
    return 0;
}

1209
static int kvm_get_xcrs(X86CPU *cpu)
1210
{
1211
    CPUX86State *env = &cpu->env;
1212 1213 1214
    int i, ret;
    struct kvm_xcrs xcrs;

1215
    if (!kvm_has_xcrs()) {
1216
        return 0;
1217
    }
1218

1219
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
1220
    if (ret < 0) {
1221
        return ret;
1222
    }
1223

1224
    for (i = 0; i < xcrs.nr_xcrs; i++) {
1225 1226 1227 1228 1229
        /* Only support xcr0 now */
        if (xcrs.xcrs[0].xcr == 0) {
            env->xcr0 = xcrs.xcrs[0].value;
            break;
        }
1230
    }
1231 1232 1233
    return 0;
}

1234
static int kvm_get_sregs(X86CPU *cpu)
A
aliguori 已提交
1235
{
1236
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
1237 1238
    struct kvm_sregs sregs;
    uint32_t hflags;
1239
    int bit, i, ret;
A
aliguori 已提交
1240

1241
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1242
    if (ret < 0) {
A
aliguori 已提交
1243
        return ret;
1244
    }
A
aliguori 已提交
1245

1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    env->interrupt_injected = -1;
    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
        if (sregs.interrupt_bitmap[i]) {
            bit = ctz64(sregs.interrupt_bitmap[i]);
            env->interrupt_injected = i * 64 + bit;
            break;
        }
    }
A
aliguori 已提交
1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    env->efer = sregs.efer;
1278 1279

    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
A
aliguori 已提交
1280

1281 1282 1283 1284 1285
#define HFLAG_COPY_MASK \
    ~( HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
A
aliguori 已提交
1286 1287 1288 1289

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
1290
                (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
A
aliguori 已提交
1291 1292
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
1293
                (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);
A
aliguori 已提交
1294 1295 1296 1297 1298 1299 1300 1301 1302

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
1303
                    (DESC_B_SHIFT - HF_CS32_SHIFT);
A
aliguori 已提交
1304
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
1305 1306 1307 1308 1309 1310 1311 1312
                    (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base | env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
A
aliguori 已提交
1313 1314 1315 1316 1317 1318
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;

    return 0;
}

1319
static int kvm_get_msrs(X86CPU *cpu)
A
aliguori 已提交
1320
{
1321
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int ret, i, n;

    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
1333
    msrs[n++].index = MSR_PAT;
1334
    if (has_msr_star) {
1335 1336
        msrs[n++].index = MSR_STAR;
    }
1337
    if (has_msr_hsave_pa) {
M
Marcelo Tosatti 已提交
1338
        msrs[n++].index = MSR_VM_HSAVE_PA;
1339
    }
1340 1341 1342
    if (has_msr_tsc_adjust) {
        msrs[n++].index = MSR_TSC_ADJUST;
    }
1343 1344 1345
    if (has_msr_tsc_deadline) {
        msrs[n++].index = MSR_IA32_TSCDEADLINE;
    }
A
Avi Kivity 已提交
1346 1347 1348
    if (has_msr_misc_enable) {
        msrs[n++].index = MSR_IA32_MISC_ENABLE;
    }
1349 1350 1351

    if (!env->tsc_valid) {
        msrs[n++].index = MSR_IA32_TSC;
1352
        env->tsc_valid = !runstate_is_running();
1353 1354
    }

A
aliguori 已提交
1355
#ifdef TARGET_X86_64
1356 1357 1358 1359 1360 1361
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
A
aliguori 已提交
1362
#endif
1363 1364
    msrs[n++].index = MSR_KVM_SYSTEM_TIME;
    msrs[n++].index = MSR_KVM_WALL_CLOCK;
1365 1366 1367
    if (has_msr_async_pf_en) {
        msrs[n++].index = MSR_KVM_ASYNC_PF_EN;
    }
M
Michael S. Tsirkin 已提交
1368 1369 1370
    if (has_msr_pv_eoi_en) {
        msrs[n++].index = MSR_KVM_PV_EOI_EN;
    }
1371 1372 1373
    if (has_msr_kvm_steal_time) {
        msrs[n++].index = MSR_KVM_STEAL_TIME;
    }
1374

1375 1376 1377
    if (env->mcg_cap) {
        msrs[n++].index = MSR_MCG_STATUS;
        msrs[n++].index = MSR_MCG_CTL;
1378
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
1379
            msrs[n++].index = MSR_MC0_CTL + i;
1380
        }
1381 1382
    }

A
aliguori 已提交
1383
    msr_data.info.nmsrs = n;
1384
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
1385
    if (ret < 0) {
A
aliguori 已提交
1386
        return ret;
1387
    }
A
aliguori 已提交
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399

    for (i = 0; i < ret; i++) {
        switch (msrs[i].index) {
        case MSR_IA32_SYSENTER_CS:
            env->sysenter_cs = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_ESP:
            env->sysenter_esp = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_EIP:
            env->sysenter_eip = msrs[i].data;
            break;
1400 1401 1402
        case MSR_PAT:
            env->pat = msrs[i].data;
            break;
A
aliguori 已提交
1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422
        case MSR_STAR:
            env->star = msrs[i].data;
            break;
#ifdef TARGET_X86_64
        case MSR_CSTAR:
            env->cstar = msrs[i].data;
            break;
        case MSR_KERNELGSBASE:
            env->kernelgsbase = msrs[i].data;
            break;
        case MSR_FMASK:
            env->fmask = msrs[i].data;
            break;
        case MSR_LSTAR:
            env->lstar = msrs[i].data;
            break;
#endif
        case MSR_IA32_TSC:
            env->tsc = msrs[i].data;
            break;
1423 1424 1425
        case MSR_TSC_ADJUST:
            env->tsc_adjust = msrs[i].data;
            break;
1426 1427 1428
        case MSR_IA32_TSCDEADLINE:
            env->tsc_deadline = msrs[i].data;
            break;
1429 1430 1431
        case MSR_VM_HSAVE_PA:
            env->vm_hsave = msrs[i].data;
            break;
1432 1433 1434 1435 1436 1437
        case MSR_KVM_SYSTEM_TIME:
            env->system_time_msr = msrs[i].data;
            break;
        case MSR_KVM_WALL_CLOCK:
            env->wall_clock_msr = msrs[i].data;
            break;
1438 1439 1440 1441 1442 1443
        case MSR_MCG_STATUS:
            env->mcg_status = msrs[i].data;
            break;
        case MSR_MCG_CTL:
            env->mcg_ctl = msrs[i].data;
            break;
A
Avi Kivity 已提交
1444 1445 1446
        case MSR_IA32_MISC_ENABLE:
            env->msr_ia32_misc_enable = msrs[i].data;
            break;
1447 1448 1449 1450 1451
        default:
            if (msrs[i].index >= MSR_MC0_CTL &&
                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
            }
H
Hidetoshi Seto 已提交
1452
            break;
1453 1454 1455
        case MSR_KVM_ASYNC_PF_EN:
            env->async_pf_en_msr = msrs[i].data;
            break;
M
Michael S. Tsirkin 已提交
1456 1457 1458
        case MSR_KVM_PV_EOI_EN:
            env->pv_eoi_en_msr = msrs[i].data;
            break;
1459 1460 1461
        case MSR_KVM_STEAL_TIME:
            env->steal_time_msr = msrs[i].data;
            break;
A
aliguori 已提交
1462 1463 1464 1465 1466 1467
        }
    }

    return 0;
}

1468
static int kvm_put_mp_state(X86CPU *cpu)
1469
{
1470
    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
1471

1472
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
1473 1474
}

1475
static int kvm_get_mp_state(X86CPU *cpu)
1476
{
1477
    CPUState *cs = CPU(cpu);
1478
    CPUX86State *env = &cpu->env;
1479 1480 1481
    struct kvm_mp_state mp_state;
    int ret;

1482
    ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
1483 1484 1485 1486
    if (ret < 0) {
        return ret;
    }
    env->mp_state = mp_state.mp_state;
1487
    if (kvm_irqchip_in_kernel()) {
1488
        cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
1489
    }
1490 1491 1492
    return 0;
}

1493
static int kvm_get_apic(X86CPU *cpu)
1494
{
1495
    CPUX86State *env = &cpu->env;
1496 1497 1498 1499
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;
    int ret;

1500
    if (apic && kvm_irqchip_in_kernel()) {
1501
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
1502 1503 1504 1505 1506 1507 1508 1509 1510
        if (ret < 0) {
            return ret;
        }

        kvm_get_apic_state(apic, &kapic);
    }
    return 0;
}

1511
static int kvm_put_apic(X86CPU *cpu)
1512
{
1513
    CPUX86State *env = &cpu->env;
1514 1515 1516
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;

1517
    if (apic && kvm_irqchip_in_kernel()) {
1518 1519
        kvm_put_apic_state(apic, &kapic);

1520
        return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_LAPIC, &kapic);
1521 1522 1523 1524
    }
    return 0;
}

1525
static int kvm_put_vcpu_events(X86CPU *cpu, int level)
1526
{
1527
    CPUX86State *env = &cpu->env;
1528 1529 1530 1531 1532 1533
    struct kvm_vcpu_events events;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

1534 1535
    events.exception.injected = (env->exception_injected >= 0);
    events.exception.nr = env->exception_injected;
1536 1537
    events.exception.has_error_code = env->has_error_code;
    events.exception.error_code = env->error_code;
1538
    events.exception.pad = 0;
1539 1540 1541 1542 1543 1544 1545 1546

    events.interrupt.injected = (env->interrupt_injected >= 0);
    events.interrupt.nr = env->interrupt_injected;
    events.interrupt.soft = env->soft_interrupt;

    events.nmi.injected = env->nmi_injected;
    events.nmi.pending = env->nmi_pending;
    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
1547
    events.nmi.pad = 0;
1548 1549 1550

    events.sipi_vector = env->sipi_vector;

1551 1552 1553 1554 1555
    events.flags = 0;
    if (level >= KVM_PUT_RESET_STATE) {
        events.flags |=
            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
    }
1556

1557
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
1558 1559
}

1560
static int kvm_get_vcpu_events(X86CPU *cpu)
1561
{
1562
    CPUX86State *env = &cpu->env;
1563 1564 1565 1566 1567 1568 1569
    struct kvm_vcpu_events events;
    int ret;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

1570
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
1571 1572 1573
    if (ret < 0) {
       return ret;
    }
1574
    env->exception_injected =
1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595
       events.exception.injected ? events.exception.nr : -1;
    env->has_error_code = events.exception.has_error_code;
    env->error_code = events.exception.error_code;

    env->interrupt_injected =
        events.interrupt.injected ? events.interrupt.nr : -1;
    env->soft_interrupt = events.interrupt.soft;

    env->nmi_injected = events.nmi.injected;
    env->nmi_pending = events.nmi.pending;
    if (events.nmi.masked) {
        env->hflags2 |= HF2_NMI_MASK;
    } else {
        env->hflags2 &= ~HF2_NMI_MASK;
    }

    env->sipi_vector = events.sipi_vector;

    return 0;
}

static int kvm_guest_debug_workarounds(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int ret = 0;
    unsigned long reinject_trap = 0;

    if (!kvm_has_vcpu_events()) {
        if (env->exception_injected == 1) {
            reinject_trap = KVM_GUESTDBG_INJECT_DB;
        } else if (env->exception_injected == 3) {
            reinject_trap = KVM_GUESTDBG_INJECT_BP;
        }
        env->exception_injected = -1;
    }

    /*
     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
     * by updating the debug state once again if single-stepping is on.
     * Another reason to call kvm_update_guest_debug here is a pending debug
     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have
     * to reinject it via SET_GUEST_DEBUG.
     */
    if (reinject_trap ||
        (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
        ret = kvm_update_guest_debug(env, reinject_trap);
    }
    return ret;
}

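/*
 * Write DR0-DR3, DR6 and DR7 to the kernel if KVM_CAP_DEBUGREGS is
 * available; otherwise silently succeed so callers need not care.
 */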
static int kvm_put_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    for (i = 0; i < 4; i++) {
        dbgregs.db[i] = env->dr[i];
    }
    dbgregs.dr6 = env->dr[6];
    dbgregs.dr7 = env->dr[7];
    dbgregs.flags = 0;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
}

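/*
 * Read the debug registers back. DR4/DR5 are filled with the DR6/DR7
 * values, matching their architectural aliasing when CR4.DE is clear.
 */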
static int kvm_get_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i, ret;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < 4; i++) {
        env->dr[i] = dbgregs.db[i];
    }
    env->dr[4] = env->dr[6] = dbgregs.dr6;
    env->dr[5] = env->dr[7] = dbgregs.dr7;

    return 0;
}

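/*
 * Push all of QEMU's x86 CPU state into the kernel. 'level' selects how
 * much to write (runtime-only, reset, or full state); the ordering below
 * matters, as noted inline for the MCE and guest-debug steps.
 */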
int kvm_arch_put_registers(CPUState *cpu, int level)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    ret = kvm_getput_regs(x86_cpu, 1);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xsave(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xcrs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_sregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be before kvm_put_msrs */
    ret = kvm_inject_mce_oldstyle(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_msrs(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_mp_state(x86_cpu);
        if (ret < 0) {
            return ret;
        }
        ret = kvm_put_apic(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }
    ret = kvm_put_vcpu_events(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_debugregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be last */
    ret = kvm_guest_debug_workarounds(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

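/*
 * Counterpart of kvm_arch_put_registers: pull the complete vcpu state out
 * of the kernel into CPUX86State. Only safe while the vcpu is stopped or
 * from the vcpu's own thread, hence the assert.
 */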
int kvm_arch_get_registers(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    int ret;

    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));

    ret = kvm_getput_regs(cpu, 0);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xsave(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xcrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_sregs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_msrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_mp_state(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_apic(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_vcpu_events(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_debugregs(cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

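/*
 * Called before each KVM_RUN. Injects pending NMIs and, when the irqchip
 * is emulated in userspace, also injects PIC interrupts, requests an
 * interrupt-window exit if the guest cannot take one yet, and forwards
 * the current TPR through run->cr8.
 */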
void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int ret;

    /* Inject NMI */
    if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
        DPRINTF("injected NMI\n");
        ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
        if (ret < 0) {
            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
                    strerror(-ret));
        }
    }

    if (!kvm_irqchip_in_kernel()) {
        /* Force the VCPU out of its inner loop to process any INIT requests
         * or pending TPR access reports. */
        if (cpu->interrupt_request &
            (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
            cpu->exit_request = 1;
        }

        /* Try to inject an interrupt if the guest can accept it */
        if (run->ready_for_interrupt_injection &&
            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK)) {
            int irq;

            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            if (irq >= 0) {
                struct kvm_interrupt intr;

                intr.irq = irq;
                DPRINTF("injected interrupt %d\n", irq);
                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
                if (ret < 0) {
                    fprintf(stderr,
                            "KVM: injection failed, interrupt lost (%s)\n",
                            strerror(-ret));
                }
            }
        }

        /* If we have an interrupt but the guest is not ready to receive an
         * interrupt, request an interrupt window exit.  This will
         * cause a return to userspace as soon as the guest is ready to
         * receive interrupts. */
        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
            run->request_interrupt_window = 1;
        } else {
            run->request_interrupt_window = 0;
        }

        DPRINTF("setting tpr\n");
        run->cr8 = cpu_get_apic_tpr(env->apic_state);
    }
}

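/*
 * Called after each KVM_RUN returns: sync back the interrupt flag, TPR
 * and APIC base that the guest may have changed while in the kernel.
 */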
void kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    if (run->if_flag) {
        env->eflags |= IF_MASK;
    } else {
        env->eflags &= ~IF_MASK;
    }
    cpu_set_apic_tpr(env->apic_state, run->cr8);
    cpu_set_apic_base(env->apic_state, run->apic_base);
}

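/*
 * Handle interrupt_request bits that must be processed outside KVM_RUN:
 * MCE delivery (a pending double fault escalates to a triple fault and a
 * system reset) and, for the userspace irqchip, APIC polling, INIT, SIPI
 * and TPR access reports. Returns non-zero if the vcpu stays halted.
 */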
int kvm_arch_process_async_events(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
        assert(env->mcg_cap);

        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;

        kvm_cpu_synchronize_state(cs);

        if (env->exception_injected == EXCP08_DBLE) {
            /* this means triple fault */
            qemu_system_reset_request();
            cs->exit_request = 1;
            return 0;
        }
        env->exception_injected = EXCP12_MCHK;
        env->has_error_code = 0;

        cs->halted = 0;
        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
            env->mp_state = KVM_MP_STATE_RUNNABLE;
        }
    }

    if (kvm_irqchip_in_kernel()) {
        return 0;
    }

    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(env->apic_state);
    }
    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
        cs->halted = 0;
    }
    if (cs->interrupt_request & CPU_INTERRUPT_INIT) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_init(cpu);
    }
    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_sipi(cpu);
    }
    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
        kvm_cpu_synchronize_state(cs);
        apic_handle_tpr_access_report(env->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return cs->halted;
}

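/*
 * HLT exit: keep the vcpu halted unless an unmasked interrupt or NMI is
 * already pending, in which case execution simply continues.
 */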
static int kvm_handle_halt(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;

    if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
        cs->halted = 1;
        return EXCP_HLT;
    }

    return 0;
}

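/*
 * Forward a TPR access report (faulting rip plus access direction) from
 * the kernel to the userspace APIC model.
 */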
static int kvm_handle_tpr_access(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    CPUState *cs = CPU(cpu);
    struct kvm_run *run = cs->kvm_run;

    apic_handle_tpr_access_report(env->apic_state, run->tpr_access.rip,
                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
                                                           : TPR_ACCESS_READ);
    return 1;
}

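/*
 * GDB software breakpoint: save the original byte at bp->pc and patch in
 * an int3 (0xcc) opcode.
 */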
int kvm_arch_insert_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    static const uint8_t int3 = 0xcc;

    if (cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&int3, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

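/*
 * Undo kvm_arch_insert_sw_breakpoint, verifying the int3 byte is still in
 * place before restoring the saved instruction byte.
 */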
int kvm_arch_remove_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint8_t int3;

    if (cpu_memory_rw_debug(env, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

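/*
 * Shadow of the four architectural x86 debug address registers;
 * nb_hw_breakpoint counts the slots in use and find_hw_breakpoint()
 * locates a matching entry.
 */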
static struct {
    target_ulong addr;
    int len;
    int type;
} hw_breakpoint[4];

static int nb_hw_breakpoint;

static int find_hw_breakpoint(target_ulong addr, int len, int type)
{
    int n;

    for (n = 0; n < nb_hw_breakpoint; n++) {
        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
            (hw_breakpoint[n].len == len || len == -1)) {
            return n;
        }
    }
    return -1;
}

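/*
 * Validate and record a GDB hardware breakpoint/watchpoint request:
 * execution breakpoints are forced to length 1, watchpoints must be
 * naturally aligned powers of two, and at most four slots exist, per
 * the x86 debug-register architecture.
 */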
int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    switch (type) {
    case GDB_BREAKPOINT_HW:
        len = 1;
        break;
    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_ACCESS:
        switch (len) {
        case 1:
            break;
        case 2:
        case 4:
        case 8:
            if (addr & (len - 1)) {
                return -EINVAL;
            }
            break;
        default:
            return -EINVAL;
        }
        break;
    default:
        return -ENOSYS;
    }

    if (nb_hw_breakpoint == 4) {
        return -ENOBUFS;
    }
    if (find_hw_breakpoint(addr, len, type) >= 0) {
        return -EEXIST;
    }
    hw_breakpoint[nb_hw_breakpoint].addr = addr;
    hw_breakpoint[nb_hw_breakpoint].len = len;
    hw_breakpoint[nb_hw_breakpoint].type = type;
    nb_hw_breakpoint++;

    return 0;
}

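/*
 * Drop a previously registered slot; the last entry is moved into the
 * freed position so the table stays dense.
 */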
int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
    if (n < 0) {
        return -ENOENT;
    }
    nb_hw_breakpoint--;
    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];

    return 0;
}

void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}

static CPUWatchpoint hw_watchpoint;

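/*
 * Decode a KVM_EXIT_DEBUG: DR6 bit 14 (BS) flags a single-step trap, the
 * low four DR6 bits identify which hardware breakpoint fired, and the
 * matching DR7 R/W field distinguishes execution, write and access
 * breakpoints. Anything the debugger did not request is reinjected into
 * the guest.
 */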
static int kvm_handle_debug(X86CPU *cpu,
                            struct kvm_debug_exit_arch *arch_info)
{
    CPUX86State *env = &cpu->env;
    int ret = 0;
    int n;

    if (arch_info->exception == 1) {
        if (arch_info->dr6 & (1 << 14)) {
            if (env->singlestep_enabled) {
                ret = EXCP_DEBUG;
            }
        } else {
            for (n = 0; n < 4; n++) {
                if (arch_info->dr6 & (1 << n)) {
                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
                    case 0x0:
                        ret = EXCP_DEBUG;
                        break;
                    case 0x1:
                        ret = EXCP_DEBUG;
                        env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_WRITE;
                        break;
                    case 0x3:
                        ret = EXCP_DEBUG;
                        env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_ACCESS;
                        break;
                    }
                }
            }
        }
    } else if (kvm_find_sw_breakpoint(CPU(cpu), arch_info->pc)) {
        ret = EXCP_DEBUG;
    }
    if (ret == 0) {
        cpu_synchronize_state(CPU(cpu));
        assert(env->exception_injected == -1);

        /* pass to guest */
        env->exception_injected = arch_info->exception;
        env->has_error_code = 0;
    }

    return ret;
}

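/*
 * Build the SET_GUEST_DEBUG control block: enable software breakpoints if
 * any are registered, and encode the hardware breakpoint table into the
 * DR7 image. The 0x0600 seed appears to set the GE flag plus DR7's
 * always-one reserved bit; each slot then gets its local-enable, type and
 * length fields.
 */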
void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
{
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(cpu)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }
    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}

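/*
 * Probe CPUID leaf 1 ECX for the VMX feature bit, i.e. whether the host
 * is an Intel machine with VT-x; used below to give a better diagnostic
 * for VMX_INVALID_GUEST_STATE entry failures.
 */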
static bool host_supports_vmx(void)
{
    uint32_t ecx, unused;

    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
    return ecx & CPUID_EXT_VMX;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

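/*
 * Central dispatch for exit reasons the generic KVM code does not handle
 * itself. A negative return stops the VM as an internal error; zero
 * resumes the guest; positive EXCP_* values propagate back to the cpu
 * loop.
 */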
int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
    X86CPU *cpu = X86_CPU(cs);
    uint64_t code;
    int ret;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        DPRINTF("handle_hlt\n");
        ret = kvm_handle_halt(cpu);
        break;
    case KVM_EXIT_SET_TPR:
        ret = 0;
        break;
    case KVM_EXIT_TPR_ACCESS:
        ret = kvm_handle_tpr_access(cpu);
        break;
    case KVM_EXIT_FAIL_ENTRY:
        code = run->fail_entry.hardware_entry_failure_reason;
        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
                code);
        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
            fprintf(stderr,
                    "\nIf you're running a guest on an Intel machine without "
                        "unrestricted mode\n"
                    "support, the failure is most likely due to the guest "
                        "entering an invalid\n"
                    "state for Intel VT. For example, the guest may be running "
                        "in big real mode\n"
                    "which is not supported on less recent Intel processors."
                        "\n\n");
        }
        ret = -1;
        break;
    case KVM_EXIT_EXCEPTION:
        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
                run->ex.exception, run->ex.error_code);
        ret = -1;
        break;
    case KVM_EXIT_DEBUG:
        DPRINTF("kvm_exit_debug\n");
        ret = kvm_handle_debug(cpu, &run->debug.arch);
        break;
    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    return ret;
}

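/*
 * After a failed instruction emulation, keep the VM running only if the
 * fault hit protected-mode user code (CPL 3); failures in real mode or
 * in the guest kernel are treated as fatal.
 */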
bool kvm_arch_stop_on_emulation_error(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    kvm_cpu_synchronize_state(cs);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

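/*
 * Called once the in-kernel irqchip is set up: check GSI routing support
 * (HPET's IRQ0->2 override needs it) and advertise the irqfd and MSI
 * routing capabilities to the generic KVM layer.
 */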
void kvm_arch_init_irq_routing(KVMState *s)
{
    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
        /* If kernel can't do irq routing, interrupt source
         * override 0->2 cannot be set up as required by HPET.
         * So we have to disable it.
         */
        no_hpet = 1;
    }
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use msi via irqfd and GSI routing.
     */
    kvm_irqfds_allowed = true;
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;
}

/* Classic KVM device assignment interface. Will remain x86 only. */
int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr,
                          uint32_t flags, uint32_t *dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .segnr = dev_addr->domain,
        .busnr = dev_addr->bus,
        .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function),
        .flags = flags,
    };
    int ret;

    dev_data.assigned_dev_id =
        (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn;

    ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data);
    if (ret < 0) {
        return ret;
    }

    *dev_id = dev_data.assigned_dev_id;

    return 0;
}

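/*
 * Hypothetical call sequence for assigning a device and its MSI (a sketch
 * using only the helpers in this block; 'addr' and 'virq' are assumed to
 * come from the caller, and error handling is omitted):
 *
 *   uint32_t dev_id;
 *   if (kvm_device_pci_assign(kvm_state, &addr, 0, &dev_id) == 0) {
 *       kvm_device_msi_assign(kvm_state, dev_id, virq);
 *       ...
 *       kvm_device_msi_deassign(kvm_state, dev_id);
 *       kvm_device_pci_deassign(kvm_state, dev_id);
 *   }
 */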
int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data);
}

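/*
 * Prefer the newer KVM_ASSIGN_DEV_IRQ ioctl when the kernel advertises
 * KVM_CAP_ASSIGN_DEV_IRQ, falling back to the legacy KVM_ASSIGN_IRQ;
 * both paths take the same kvm_assigned_irq argument, so callers do not
 * notice which one was used.
 */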
static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}

int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi,
                           uint32_t guest_irq)
{
    uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX);

    return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq);
}

int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
        .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data);
}

static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id,
                                     uint32_t type)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .flags = type,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq);
}

int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX));
}

int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI |
                                              KVM_DEV_IRQ_GUEST_MSI, virq);
}

int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI |
                                                KVM_DEV_IRQ_HOST_MSI);
}

bool kvm_device_msix_supported(KVMState *s)
{
    /* The kernel lacks a corresponding KVM_CAP, so we probe by calling
     * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */
    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT;
}

int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id,
                                 uint32_t nr_vectors)
{
    struct kvm_assigned_msix_nr msix_nr = {
        .assigned_dev_id = dev_id,
        .entry_nr = nr_vectors,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr);
}

int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector,
                               int virq)
{
    struct kvm_assigned_msix_entry msix_entry = {
        .assigned_dev_id = dev_id,
        .gsi = virq,
        .entry = vector,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry);
}

int kvm_device_msix_assign(KVMState *s, uint32_t dev_id)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX |
                                              KVM_DEV_IRQ_GUEST_MSIX, 0);
}

int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX |
                                                KVM_DEV_IRQ_HOST_MSIX);
}