/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>

#include "qemu-common.h"
#include "sysemu/sysemu.h"
#include "sysemu/kvm.h"
#include "kvm_i386.h"
#include "cpu.h"
#include "exec/gdbstub.h"
#include "qemu/host-utils.h"
#include "qemu/config-file.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
#include "hw/i386/apic_internal.h"
#include "hw/i386/apic-msidef.h"
#include "exec/ioport.h"
#include <asm/hyperv.h>
#include "hw/pci/pci.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif
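
/* BUS_MCEERR_AR ("action required") and BUS_MCEERR_AO ("action optional")
 * are the siginfo si_code values the kernel delivers with SIGBUS on a
 * machine-check event.  The fallback definitions above cover old headers
 * that do not provide them yet.
 */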

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};
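
/* The generic init code (kvm_init() in kvm-all.c) checks this list against
 * the running kernel and refuses to start if a required capability is
 * missing.
 */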

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_async_pf_en;
static bool has_msr_pv_eoi_en;
static bool has_msr_misc_enable;
static bool has_msr_bndcfgs;
static bool has_msr_kvm_steal_time;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_vapic;
static bool has_msr_hv_tsc;

static bool has_msr_architectural_pmu;
static uint32_t num_architectural_pmu_counters;

bool kvm_allows_irq0_override(void)
{
    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}

static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = (struct kvm_cpuid2 *)g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    return cpuid;
}
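
/* Illustrative use, mirroring the callers in this file:
 *
 *     struct kvm_cpuid2 *cpuid = get_supported_cpuid(s);
 *     ...walk cpuid->entries[0 .. cpuid->nent - 1]...
 *     g_free(cpuid);
 *
 * try_get_cpuid() returns NULL while the buffer is too small, so the
 * doubling loop converges after O(log n) ioctl calls.
 */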

static const struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
};

static int get_para_features(KVMState *s)
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features); i++) {
        if (kvm_check_extension(s, para_features[i].cap)) {
            features |= (1 << para_features[i].feature);
        }
    }

    return features;
}
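
/* This fallback is only used on old kernels whose GET_SUPPORTED_CPUID does
 * not enumerate KVM_CPUID_FEATURES; each advertised capability maps to one
 * feature bit, e.g. KVM_CAP_ASYNC_PF -> (1 << KVM_FEATURE_ASYNC_PF).
 */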


/* Returns the value of a specific register in the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find the matching entry for function/index in a kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;
    bool found = false;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        found = true;
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    }

    g_free(cpuid);

    /* fallback for older kernels */
    if ((function == KVM_CPUID_FEATURES) && !found) {
        ret = get_para_features(s);
    }

    return ret;
}
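
/* Example (hypothetical caller): query whether the host can virtualize
 * x2APIC, with the fixups above already applied:
 *
 *     uint32_t ecx = kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX);
 *     bool has_x2apic = ecx & CPUID_EXT_X2APIC;
 */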

typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_malloc(sizeof(HWPoisonPage));
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}
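
/* Poisoned-page lifecycle: the SIGBUS handlers below record the affected
 * guest RAM address here, and kvm_unpoison_all(), registered as a reset
 * handler in kvm_arch_init(), remaps the pages so a rebooted guest starts
 * with clean memory again.
 */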

static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    int r;

    r = kvm_check_extension(s, KVM_CAP_MCE);
    if (r > 0) {
        *max_banks = r;
        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    }
    return -ENOSYS;
}

static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
{
    CPUX86State *env = &cpu->env;
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    uint64_t mcg_status = MCG_STATUS_MCIP;

    if (code == BUS_MCEERR_AR) {
        status |= MCI_STATUS_AR | 0x134;
        mcg_status |= MCG_STATUS_EIPV;
    } else {
        status |= 0xc0;
        mcg_status |= MCG_STATUS_RIPV;
    }
    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
                       (MCM_ADDR_PHYS << 6) | 0xc,
                       cpu_x86_support_mca_broadcast(env) ?
                       MCE_INJECT_BROADCAST : 0);
}

static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}

int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
{
    X86CPU *cpu = X86_CPU(c);
    CPUX86State *env = &cpu->env;
    ram_addr_t ram_addr;
    hwaddr paddr;

    if ((env->mcg_cap & MCG_SER_P) && addr
        && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
            !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
            /* Hope we are lucky for AO MCE */
            if (code == BUS_MCEERR_AO) {
                return 0;
            } else {
                hardware_memory_error();
            }
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(cpu, paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

int kvm_arch_on_sigbus(int code, void *addr)
{
    X86CPU *cpu = X86_CPU(first_cpu);

    if ((cpu->env.mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
        ram_addr_t ram_addr;
        hwaddr paddr;

        /* Hope we are lucky for AO MCE */
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
            !kvm_physical_memory_addr_from_host(first_cpu->kvm_state,
                                                addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system: %p\n", addr);
            return 0;
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(X86_CPU(first_cpu), paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}
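
/* Both SIGBUS paths above implement the same policy: an AO error that
 * cannot be mapped to guest RAM is ignored in the hope it is never
 * consumed, an AR error in that situation is fatal, and errors inside
 * guest RAM are recorded and forwarded to the guest as a virtual MCE.
 */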

static int kvm_inject_mce_oldstyle(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        env->exception_injected = -1;

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
         */
        for (bank = 0; bank < bank_num; bank++) {
            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
                break;
            }
        }
        assert(bank < bank_num);

        mce.bank = bank;
        mce.status = env->mce_banks[bank * 4 + 1];
        mce.mcg_status = env->mcg_status;
        mce.addr = env->mce_banks[bank * 4 + 2];
        mce.misc = env->mce_banks[bank * 4 + 3];

        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
    }
    return 0;
}

static void cpu_update_state(void *opaque, int running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

unsigned long kvm_arch_vcpu_id(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    return cpu->env.cpuid_apic_id;
}

#ifndef KVM_CPUID_SIGNATURE_NEXT
#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
#endif

static bool hyperv_hypercall_available(X86CPU *cpu)
{
    return cpu->hyperv_vapic ||
           (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY);
}

static bool hyperv_enabled(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
           (hyperv_hypercall_available(cpu) ||
            cpu->hyperv_time  ||
            cpu->hyperv_relaxed_timing);
}

#define KVM_MAX_CPUID_ENTRIES  100

int kvm_arch_init_vcpu(CPUState *cs)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
    } QEMU_PACKED cpuid_data;
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint32_t limit, i, j, cpuid_i;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
    int kvm_base = KVM_CPUID_SIGNATURE;
    int r;

    memset(&cpuid_data, 0, sizeof(cpuid_data));

    cpuid_i = 0;

    /* Paravirtualization CPUIDs */
    if (hyperv_enabled(cpu)) {
        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
        memcpy(signature, "Microsoft Hv", 12);
        c->eax = HYPERV_CPUID_MIN;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_INTERFACE;
        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];
        c->ebx = 0;
        c->ecx = 0;
        c->edx = 0;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_VERSION;
        c->eax = 0x00001bbc;
        c->ebx = 0x00060001;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_FEATURES;
        if (cpu->hyperv_relaxed_timing) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
        }
        if (cpu->hyperv_vapic) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
            has_msr_hv_vapic = true;
        }
        if (cpu->hyperv_time &&
            kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_TIME_REF_COUNT_AVAILABLE;
            c->eax |= 0x200;
            has_msr_hv_tsc = true;
        }
        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
        if (cpu->hyperv_relaxed_timing) {
            c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
        }
        if (has_msr_hv_vapic) {
            c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
        }
        c->ebx = cpu->hyperv_spinlock_attempts;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_IMPLEMENT_LIMITS;
        c->eax = 0x40;
        c->ebx = 0x40;

        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
        has_msr_hv_hypercall = true;
    }

    if (cpu->expose_kvm) {
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c = &cpuid_data.entries[cpuid_i++];
        c->function = KVM_CPUID_SIGNATURE | kvm_base;
        c->eax = KVM_CPUID_FEATURES | kvm_base;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];

        c = &cpuid_data.entries[cpuid_i++];
        c->function = KVM_CPUID_FEATURES | kvm_base;
        c->eax = env->features[FEAT_KVM];

        has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);

        has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);

        has_msr_kvm_steal_time = c->eax & (1 << KVM_FEATURE_STEAL_TIME);
    }

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
            abort();
        }
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
                if (i == 0xd && j == 64) {
                    break;
                }
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    continue;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            break;
        }
    }

    if (limit >= 0x0a) {
        uint32_t ver;

        cpu_x86_cpuid(env, 0x0a, 0, &ver, &unused, &unused, &unused);
        if ((ver & 0xff) > 0) {
            has_msr_architectural_pmu = true;
            num_architectural_pmu_counters = (ver & 0xff00) >> 8;

            /* Shouldn't be more than 32, since that's the number of bits
             * available in EBX to tell us _which_ counters are available.
             * Play it safe.
             */
            if (num_architectural_pmu_counters > MAX_GP_COUNTERS) {
                num_architectural_pmu_counters = MAX_GP_COUNTERS;
            }
        }
    }

    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
            abort();
        }
        c = &cpuid_data.entries[cpuid_i++];

        c->function = i;
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
    }

    /* Call Centaur's CPUID instructions if they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
                abort();
            }
            c = &cpuid_data.entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

    cpuid_data.cpuid.nent = cpuid_i;

    if (((env->cpuid_version >> 8) & 0xF) >= 6
        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
           (CPUID_MCE | CPUID_MCA)
        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
        uint64_t mcg_cap;
        int banks;
        int ret;

        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
        if (ret < 0) {
            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
            return ret;
        }

        if (banks > MCE_BANKS_DEF) {
            banks = MCE_BANKS_DEF;
        }
        mcg_cap &= MCE_CAP_DEF;
        mcg_cap |= banks;
        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
        if (ret < 0) {
            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
            return ret;
        }

        env->mcg_cap = mcg_cap;
    }

    qemu_add_vm_change_state_handler(cpu_update_state, env);

    c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
    if (c) {
        has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
                                  !!(c->ecx & CPUID_EXT_SMX);
    }

    cpuid_data.cpuid.padding = 0;
    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
    if (r) {
        return r;
    }

    r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
    if (r && env->tsc_khz) {
        r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
        if (r < 0) {
            fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
            return r;
        }
    }

    if (kvm_has_xsave()) {
        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
    }

    return 0;
}

void kvm_arch_reset_vcpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    env->exception_injected = -1;
    env->interrupt_injected = -1;
    env->xcr0 = 1;
    if (kvm_irqchip_in_kernel()) {
        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
                                          KVM_MP_STATE_UNINITIALIZED;
    } else {
        env->mp_state = KVM_MP_STATE_RUNNABLE;
    }
}

void kvm_arch_do_init_vcpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    /* APs get directly into wait-for-SIPI state.  */
    if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
        env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
    }
}

static int kvm_get_supported_msrs(KVMState *s)
{
    static int kvm_supported_msrs;
    int ret = 0;

    /* first time */
    if (kvm_supported_msrs == 0) {
        struct kvm_msr_list msr_list, *kvm_msr_list;

        kvm_supported_msrs = -1;

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
        msr_list.nmsrs = 0;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
        if (ret < 0 && ret != -E2BIG) {
            return ret;
        }
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe minimum of 1K. */
        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
                                              msr_list.nmsrs *
                                              sizeof(msr_list.indices[0])));

        kvm_msr_list->nmsrs = msr_list.nmsrs;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
                    has_msr_star = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
                    has_msr_hsave_pa = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_TSC_ADJUST) {
                    has_msr_tsc_adjust = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
                    has_msr_tsc_deadline = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                    has_msr_misc_enable = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_BNDCFGS) {
                    has_msr_bndcfgs = true;
                    continue;
                }
            }
        }

        g_free(kvm_msr_list);
    }

    return ret;
}
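
/* KVM_GET_MSR_INDEX_LIST follows the usual two-pass ioctl pattern, as a
 * sketch:
 *
 *     struct kvm_msr_list probe = { .nmsrs = 0 };
 *     kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &probe);
 *
 * The probe call fails with -E2BIG but fills in probe.nmsrs, and a second
 * call with a buffer sized accordingly returns the MSR indices.
 */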

int kvm_arch_init(KVMState *s)
{
    uint64_t identity_base = 0xfffbc000;
    uint64_t shadow_mem;
    int ret;
    struct utsname utsname;

    ret = kvm_get_supported_msrs(s);
    if (ret < 0) {
        return ret;
    }

    uname(&utsname);
    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

    /*
     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
     * Since these must be part of guest physical memory, we need to allocate
     * them, both by setting their start addresses in the kernel and by
     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
     *
     * Older KVM versions may not support setting the identity map base. In
     * that case we need to stick with the default, i.e. a 256K maximum BIOS
     * size.
     */
    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
        /* Allows up to 16M BIOSes. */
        identity_base = 0xfeffc000;

        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
        if (ret < 0) {
            return ret;
        }
    }

    /* Set TSS base one page after EPT identity map. */
    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
    if (ret < 0) {
        return ret;
    }

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }
    qemu_register_reset(kvm_unpoison_all, NULL);

    shadow_mem = qemu_opt_get_size(qemu_get_machine_opts(),
                                   "kvm_shadow_mem", -1);
    if (shadow_mem != -1) {
        shadow_mem /= 4096;
        ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
911
    lhs->padding = 0;
A
aliguori 已提交
912 913 914 915 916 917 918
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
919 920 921 922 923 924 925 926
    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
                 (rhs->present * DESC_P_MASK) |
                 (rhs->dpl << DESC_DPL_SHIFT) |
                 (rhs->db << DESC_B_SHIFT) |
                 (rhs->s * DESC_S_MASK) |
                 (rhs->l << DESC_L_SHIFT) |
                 (rhs->g * DESC_G_MASK) |
                 (rhs->avl * DESC_AVL_MASK);
A
aliguori 已提交
927 928 929 930
}

static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
    if (set) {
        *kvm_reg = *qemu_reg;
    } else {
        *qemu_reg = *kvm_reg;
    }
}
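
/* The 'set' flag selects the copy direction, so kvm_getput_regs() below
 * drives both KVM_GET_REGS and KVM_SET_REGS through one mapping table,
 * e.g. kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
 */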

static int kvm_getput_regs(X86CPU *cpu, int set)
{
    CPUX86State *env = &cpu->env;
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
        if (ret < 0) {
            return ret;
        }
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

    if (set) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
    }

    return ret;
}

980
static int kvm_put_fpu(X86CPU *cpu)
A
aliguori 已提交
981
{
982
    CPUX86State *env = &cpu->env;
A
aliguori 已提交
983 984 985 986 987 988 989
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    fpu.last_opcode = env->fpop;
    fpu.last_ip = env->fpip;
    fpu.last_dp = env->fpdp;
    for (i = 0; i < 8; ++i) {
        fpu.ftwx |= (!env->fptags[i]) << i;
    }
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
}

#define XSAVE_FCW_FSW     0
#define XSAVE_FTW_FOP     1
#define XSAVE_CWD_RIP     2
#define XSAVE_CWD_RDP     4
#define XSAVE_MXCSR       6
#define XSAVE_ST_SPACE    8
#define XSAVE_XMM_SPACE   40
#define XSAVE_XSTATE_BV   128
#define XSAVE_YMMH_SPACE  144
#define XSAVE_BNDREGS     240
#define XSAVE_BNDCSR      256
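
/* The XSAVE_* constants index the uint32_t region[] array of struct
 * kvm_xsave, so each unit is 4 bytes: XSAVE_XSTATE_BV == 128 corresponds
 * to byte offset 512, where the XSTATE_BV header field sits in the
 * hardware XSAVE layout.
 */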

static int kvm_put_xsave(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    uint16_t cwd, swd, twd;
    int i, r;

    if (!kvm_has_xsave()) {
        return kvm_put_fpu(cpu);
    }

    memset(xsave, 0, sizeof(struct kvm_xsave));
    twd = 0;
    swd = env->fpus & ~(7 << 11);
    swd |= (env->fpstt & 7) << 11;
    cwd = env->fpuc;
    for (i = 0; i < 8; ++i) {
        twd |= (!env->fptags[i]) << i;
    }
    xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
    xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
    memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
    memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
    memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
            sizeof env->fpregs);
    memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
            sizeof env->xmm_regs);
    xsave->region[XSAVE_MXCSR] = env->mxcsr;
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
    memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
            sizeof env->bnd_regs);
    memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
            sizeof(env->bndcs_regs));
    r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
    return r;
}

static int kvm_put_xcrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    xcrs.nr_xcrs = 1;
    xcrs.flags = 0;
    xcrs.xcrs[0].xcr = 0;
    xcrs.xcrs[0].value = env->xcr0;
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
}

static int kvm_put_sregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_sregs sregs;

    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;
    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
    sregs.apic_base = cpu_get_apic_base(cpu->apic_state);

    sregs.efer = env->efer;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
}

static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}
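
/* struct kvm_msrs ends in a flexible array member, so callers declare a
 * local wrapper struct that places the entry array directly behind the
 * header; kvm_put_tscdeadline_msr() below is the minimal one-MSR example.
 */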

static int kvm_put_tscdeadline_msr(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[1];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;

    if (!has_msr_tsc_deadline) {
        return 0;
    }

    kvm_msr_entry_set(&msrs[0], MSR_IA32_TSCDEADLINE, env->tsc_deadline);

    msr_data.info.nmsrs = 1;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
}

/*
 * Provide a separate write service for the feature control MSR in order to
 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
 * before writing any other state because forcibly leaving nested mode
 * invalidates the VCPU state.
 */
static int kvm_put_msr_feature_control(X86CPU *cpu)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entry;
    } msr_data;

    kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
                      cpu->env.msr_ia32_feature_control);
    msr_data.info.nmsrs = 1;
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
}

static int kvm_put_msrs(X86CPU *cpu, int level)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int n = 0, i;

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
    if (has_msr_star) {
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
    }
    if (has_msr_hsave_pa) {
        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
    }
    if (has_msr_tsc_adjust) {
        kvm_msr_entry_set(&msrs[n++], MSR_TSC_ADJUST, env->tsc_adjust);
    }
    if (has_msr_misc_enable) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
    if (has_msr_bndcfgs) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
    }
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif
    /*
     * The following MSRs have side effects on the guest or are too heavy
     * for normal writeback. Limit them to reset or full state updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
        if (has_msr_async_pf_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
                              env->async_pf_en_msr);
        }
        if (has_msr_pv_eoi_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
                              env->pv_eoi_en_msr);
        }
        if (has_msr_kvm_steal_time) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_STEAL_TIME,
                              env->steal_time_msr);
        }
        if (has_msr_architectural_pmu) {
            /* Stop the counter.  */
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_CTRL, 0);

            /* Set the counter values.  */
            for (i = 0; i < MAX_FIXED_COUNTERS; i++) {
                kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR0 + i,
                                  env->msr_fixed_counters[i]);
            }
            for (i = 0; i < num_architectural_pmu_counters; i++) {
                kvm_msr_entry_set(&msrs[n++], MSR_P6_PERFCTR0 + i,
                                  env->msr_gp_counters[i]);
                kvm_msr_entry_set(&msrs[n++], MSR_P6_EVNTSEL0 + i,
                                  env->msr_gp_evtsel[i]);
            }
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_STATUS,
                              env->msr_global_status);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_OVF_CTRL,
                              env->msr_global_ovf_ctrl);

            /* Now start the PMU.  */
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR_CTRL,
                              env->msr_fixed_ctr_ctrl);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_CTRL,
                              env->msr_global_ctrl);
        }
        if (has_msr_hv_hypercall) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID,
                              env->msr_hv_guest_os_id);
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL,
                              env->msr_hv_hypercall);
        }
        if (has_msr_hv_vapic) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE,
                              env->msr_hv_vapic);
        }
        if (has_msr_hv_tsc) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_REFERENCE_TSC,
                              env->msr_hv_tsc);
        }

        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
         *       kvm_put_msr_feature_control. */
    }
    if (env->mcg_cap) {
        int i;

        kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
        }
    }

    msr_data.info.nmsrs = n;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);

}


static int kvm_get_fpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_fpu fpu;
    int i, ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
    if (ret < 0) {
        return ret;
    }

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    env->fpop = fpu.last_opcode;
    env->fpip = fpu.last_ip;
    env->fpdp = fpu.last_dp;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    }
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

static int kvm_get_xsave(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    int ret, i;
    uint16_t cwd, swd, twd;

    if (!kvm_has_xsave()) {
        return kvm_get_fpu(cpu);
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
    if (ret < 0) {
        return ret;
    }

    cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
    swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
    twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
    env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
    env->fpstt = (swd >> 11) & 7;
    env->fpus = swd;
    env->fpuc = cwd;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((twd >> i) & 1);
    }
    memcpy(&env->fpip, &xsave->region[XSAVE_CWD_RIP], sizeof(env->fpip));
    memcpy(&env->fpdp, &xsave->region[XSAVE_CWD_RDP], sizeof(env->fpdp));
    env->mxcsr = xsave->region[XSAVE_MXCSR];
    memcpy(env->fpregs, &xsave->region[XSAVE_ST_SPACE],
            sizeof env->fpregs);
    memcpy(env->xmm_regs, &xsave->region[XSAVE_XMM_SPACE],
            sizeof env->xmm_regs);
    env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
    memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
            sizeof env->ymmh_regs);
    memcpy(env->bnd_regs, &xsave->region[XSAVE_BNDREGS],
            sizeof env->bnd_regs);
    memcpy(&env->bndcs_regs, &xsave->region[XSAVE_BNDCSR],
            sizeof(env->bndcs_regs));
    return 0;
}

static int kvm_get_xcrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int i, ret;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < xcrs.nr_xcrs; i++) {
        /* Only support xcr0 now */
        if (xcrs.xcrs[i].xcr == 0) {
            env->xcr0 = xcrs.xcrs[i].value;
            break;
        }
    }
    return 0;
}

static int kvm_get_sregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_sregs sregs;
    uint32_t hflags;
    int bit, i, ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
    if (ret < 0) {
        return ret;
    }

    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    env->interrupt_injected = -1;
    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
        if (sregs.interrupt_bitmap[i]) {
            bit = ctz64(sregs.interrupt_bitmap[i]);
            env->interrupt_injected = i * 64 + bit;
            break;
        }
    }

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    env->efer = sregs.efer;
1426 1427

    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
A
aliguori 已提交
1428

1429 1430 1431 1432 1433
#define HFLAG_COPY_MASK \
    ~( HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
                (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
1451
                    (DESC_B_SHIFT - HF_CS32_SHIFT);
A
aliguori 已提交
1452
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
1453 1454 1455 1456 1457 1458 1459 1460
                    (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base | env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
A
aliguori 已提交
1461 1462 1463 1464 1465 1466
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;

    return 0;
}

static int kvm_get_msrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int ret, i, n;

    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
1481
    msrs[n++].index = MSR_PAT;
1482
    if (has_msr_star) {
1483 1484
        msrs[n++].index = MSR_STAR;
    }
1485
    if (has_msr_hsave_pa) {
M
1487
    }
1488 1489 1490
    if (has_msr_tsc_adjust) {
        msrs[n++].index = MSR_TSC_ADJUST;
    }
1491 1492 1493
    if (has_msr_tsc_deadline) {
        msrs[n++].index = MSR_IA32_TSCDEADLINE;
    }
A
        msrs[n++].index = MSR_IA32_MISC_ENABLE;
    }
1497 1498 1499
    if (has_msr_feature_control) {
        msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
    }
L
        msrs[n++].index = MSR_IA32_BNDCFGS;
    }
1503 1504 1505

    if (!env->tsc_valid) {
        msrs[n++].index = MSR_IA32_TSC;
1506
        env->tsc_valid = !runstate_is_running();
1507 1508
    }

A
aliguori 已提交
1509
#ifdef TARGET_X86_64
1510 1511 1512 1513 1514 1515
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
A
aliguori 已提交
1516
#endif
1517 1518
    msrs[n++].index = MSR_KVM_SYSTEM_TIME;
    msrs[n++].index = MSR_KVM_WALL_CLOCK;
1519 1520 1521
    if (has_msr_async_pf_en) {
        msrs[n++].index = MSR_KVM_ASYNC_PF_EN;
    }
M
        msrs[n++].index = MSR_KVM_PV_EOI_EN;
    }
1525 1526 1527
    if (has_msr_kvm_steal_time) {
        msrs[n++].index = MSR_KVM_STEAL_TIME;
    }
P
        msrs[n++].index = MSR_CORE_PERF_FIXED_CTR_CTRL;
        msrs[n++].index = MSR_CORE_PERF_GLOBAL_CTRL;
        msrs[n++].index = MSR_CORE_PERF_GLOBAL_STATUS;
        msrs[n++].index = MSR_CORE_PERF_GLOBAL_OVF_CTRL;
        for (i = 0; i < MAX_FIXED_COUNTERS; i++) {
            msrs[n++].index = MSR_CORE_PERF_FIXED_CTR0 + i;
        }
        for (i = 0; i < num_architectural_pmu_counters; i++) {
            msrs[n++].index = MSR_P6_PERFCTR0 + i;
            msrs[n++].index = MSR_P6_EVNTSEL0 + i;
        }
    }
1541

1542 1543 1544
    if (env->mcg_cap) {
        msrs[n++].index = MSR_MCG_STATUS;
        msrs[n++].index = MSR_MCG_CTL;
1545
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
1546
            msrs[n++].index = MSR_MC0_CTL + i;
1547
        }
1548 1549
    }

1550 1551 1552 1553
    if (has_msr_hv_hypercall) {
        msrs[n++].index = HV_X64_MSR_HYPERCALL;
        msrs[n++].index = HV_X64_MSR_GUEST_OS_ID;
    }
1554 1555 1556
    if (has_msr_hv_vapic) {
        msrs[n++].index = HV_X64_MSR_APIC_ASSIST_PAGE;
    }
1557 1558 1559
    if (has_msr_hv_tsc) {
        msrs[n++].index = HV_X64_MSR_REFERENCE_TSC;
    }
1560

A
aliguori 已提交
1561
    msr_data.info.nmsrs = n;
1562
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
1563
    if (ret < 0) {
A
aliguori 已提交
1564
        return ret;
1565
    }
A
aliguori 已提交
1566 1567

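    /* On success, KVM_GET_MSRS returns the number of entries it actually
     * filled in, so only the first 'ret' slots are valid below. */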
    for (i = 0; i < ret; i++) {
        uint32_t index = msrs[i].index;
        switch (index) {
        case MSR_IA32_SYSENTER_CS:
            env->sysenter_cs = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_ESP:
            env->sysenter_esp = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_EIP:
            env->sysenter_eip = msrs[i].data;
            break;
        case MSR_PAT:
            env->pat = msrs[i].data;
            break;
        case MSR_STAR:
            env->star = msrs[i].data;
            break;
#ifdef TARGET_X86_64
        case MSR_CSTAR:
            env->cstar = msrs[i].data;
            break;
        case MSR_KERNELGSBASE:
            env->kernelgsbase = msrs[i].data;
            break;
        case MSR_FMASK:
            env->fmask = msrs[i].data;
            break;
        case MSR_LSTAR:
            env->lstar = msrs[i].data;
            break;
#endif
        case MSR_IA32_TSC:
            env->tsc = msrs[i].data;
            break;
        case MSR_TSC_ADJUST:
            env->tsc_adjust = msrs[i].data;
            break;
        case MSR_IA32_TSCDEADLINE:
            env->tsc_deadline = msrs[i].data;
            break;
        case MSR_VM_HSAVE_PA:
            env->vm_hsave = msrs[i].data;
            break;
        case MSR_KVM_SYSTEM_TIME:
            env->system_time_msr = msrs[i].data;
            break;
        case MSR_KVM_WALL_CLOCK:
            env->wall_clock_msr = msrs[i].data;
            break;
        case MSR_MCG_STATUS:
            env->mcg_status = msrs[i].data;
            break;
        case MSR_MCG_CTL:
            env->mcg_ctl = msrs[i].data;
            break;
        case MSR_IA32_MISC_ENABLE:
            env->msr_ia32_misc_enable = msrs[i].data;
            break;
        case MSR_IA32_FEATURE_CONTROL:
            env->msr_ia32_feature_control = msrs[i].data;
            break;
        case MSR_IA32_BNDCFGS:
            env->msr_bndcfgs = msrs[i].data;
            break;
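        /* Note: the "default" label below sits mid-switch on purpose (legal
         * C); it catches the variable range of MCE bank MSRs while the
         * cases after it stay reachable for their own indices. */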
        default:
            if (msrs[i].index >= MSR_MC0_CTL &&
                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
            }
            break;
        case MSR_KVM_ASYNC_PF_EN:
            env->async_pf_en_msr = msrs[i].data;
            break;
        case MSR_KVM_PV_EOI_EN:
            env->pv_eoi_en_msr = msrs[i].data;
            break;
        case MSR_KVM_STEAL_TIME:
            env->steal_time_msr = msrs[i].data;
            break;
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
            env->msr_fixed_ctr_ctrl = msrs[i].data;
            break;
        case MSR_CORE_PERF_GLOBAL_CTRL:
            env->msr_global_ctrl = msrs[i].data;
            break;
        case MSR_CORE_PERF_GLOBAL_STATUS:
            env->msr_global_status = msrs[i].data;
            break;
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
            env->msr_global_ovf_ctrl = msrs[i].data;
            break;
        case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
            env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
            break;
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
            env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
            break;
        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
            env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
            break;
        case HV_X64_MSR_HYPERCALL:
            env->msr_hv_hypercall = msrs[i].data;
            break;
        case HV_X64_MSR_GUEST_OS_ID:
            env->msr_hv_guest_os_id = msrs[i].data;
            break;
        case HV_X64_MSR_APIC_ASSIST_PAGE:
            env->msr_hv_vapic = msrs[i].data;
            break;
        case HV_X64_MSR_REFERENCE_TSC:
            env->msr_hv_tsc = msrs[i].data;
            break;
        }
    }

    return 0;
}

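/* Push QEMU's view of the vCPU run state (runnable, halted, ...) to KVM. */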
static int kvm_put_mp_state(X86CPU *cpu)
{
    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
}

static int kvm_get_mp_state(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;
    struct kvm_mp_state mp_state;
    int ret;

    ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
    if (ret < 0) {
        return ret;
    }
    env->mp_state = mp_state.mp_state;
    if (kvm_irqchip_in_kernel()) {
        cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
    }
    return 0;
}

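/* With an in-kernel irqchip the local APIC is emulated by KVM; copy its
 * register state into QEMU's APIC device model. */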
static int kvm_get_apic(X86CPU *cpu)
{
    DeviceState *apic = cpu->apic_state;
    struct kvm_lapic_state kapic;
    int ret;

    if (apic && kvm_irqchip_in_kernel()) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
        if (ret < 0) {
            return ret;
        }

        kvm_get_apic_state(apic, &kapic);
    }
    return 0;
}

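/* Counterpart of kvm_get_apic(): write the APIC device model's state back
 * into the in-kernel local APIC. */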
static int kvm_put_apic(X86CPU *cpu)
{
    DeviceState *apic = cpu->apic_state;
    struct kvm_lapic_state kapic;

    if (apic && kvm_irqchip_in_kernel()) {
        kvm_put_apic_state(apic, &kapic);

        return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_LAPIC, &kapic);
    }
    return 0;
}

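/* Transfer pending exception, interrupt and NMI state to KVM. From reset
 * level upwards, the NMI-pending and SIPI-vector fields are flagged as
 * valid as well. */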
static int kvm_put_vcpu_events(X86CPU *cpu, int level)
{
    CPUX86State *env = &cpu->env;
    struct kvm_vcpu_events events;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    events.exception.injected = (env->exception_injected >= 0);
    events.exception.nr = env->exception_injected;
    events.exception.has_error_code = env->has_error_code;
    events.exception.error_code = env->error_code;
    events.exception.pad = 0;

    events.interrupt.injected = (env->interrupt_injected >= 0);
    events.interrupt.nr = env->interrupt_injected;
    events.interrupt.soft = env->soft_interrupt;

    events.nmi.injected = env->nmi_injected;
    events.nmi.pending = env->nmi_pending;
    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
    events.nmi.pad = 0;

    events.sipi_vector = env->sipi_vector;

    events.flags = 0;
    if (level >= KVM_PUT_RESET_STATE) {
        events.flags |=
            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
    }

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
}

static int kvm_get_vcpu_events(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_vcpu_events events;
    int ret;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
    if (ret < 0) {
        return ret;
    }
    env->exception_injected =
        events.exception.injected ? events.exception.nr : -1;
    env->has_error_code = events.exception.has_error_code;
    env->error_code = events.exception.error_code;

    env->interrupt_injected =
        events.interrupt.injected ? events.interrupt.nr : -1;
    env->soft_interrupt = events.interrupt.soft;

    env->nmi_injected = events.nmi.injected;
    env->nmi_pending = events.nmi.pending;
    if (events.nmi.masked) {
        env->hflags2 |= HF2_NMI_MASK;
    } else {
        env->hflags2 &= ~HF2_NMI_MASK;
    }

    env->sipi_vector = events.sipi_vector;

    return 0;
}

static int kvm_guest_debug_workarounds(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;
    int ret = 0;
    unsigned long reinject_trap = 0;

    if (!kvm_has_vcpu_events()) {
        if (env->exception_injected == 1) {
            reinject_trap = KVM_GUESTDBG_INJECT_DB;
        } else if (env->exception_injected == 3) {
            reinject_trap = KVM_GUESTDBG_INJECT_BP;
        }
        env->exception_injected = -1;
    }

    /*
     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
     * by updating the debug state once again if single-stepping is on.
     * Another reason to call kvm_update_guest_debug here is a pending debug
     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
     * reinject it via SET_GUEST_DEBUG.
     */
    if (reinject_trap ||
        (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
        ret = kvm_update_guest_debug(cs, reinject_trap);
    }
    return ret;
}

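/* Write DR0-DR3, DR6 and DR7 to KVM, if KVM_SET_DEBUGREGS is available. */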
static int kvm_put_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    for (i = 0; i < 4; i++) {
        dbgregs.db[i] = env->dr[i];
    }
    dbgregs.dr6 = env->dr[6];
    dbgregs.dr7 = env->dr[7];
    dbgregs.flags = 0;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
}

static int kvm_get_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i, ret;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < 4; i++) {
        env->dr[i] = dbgregs.db[i];
    }
    env->dr[4] = env->dr[6] = dbgregs.dr6;
    env->dr[5] = env->dr[7] = dbgregs.dr7;

    return 0;
}

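/* Write the full vCPU state to KVM. 'level' (KVM_PUT_RUNTIME_STATE,
 * KVM_PUT_RESET_STATE or KVM_PUT_FULL_STATE) selects how much state is
 * transferred. */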
int kvm_arch_put_registers(CPUState *cpu, int level)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (level >= KVM_PUT_RESET_STATE && has_msr_feature_control) {
        ret = kvm_put_msr_feature_control(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }

    ret = kvm_getput_regs(x86_cpu, 1);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xsave(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xcrs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_sregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be before kvm_put_msrs */
    ret = kvm_inject_mce_oldstyle(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_msrs(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_mp_state(x86_cpu);
        if (ret < 0) {
            return ret;
        }
        ret = kvm_put_apic(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }

    ret = kvm_put_tscdeadline_msr(x86_cpu);
    if (ret < 0) {
        return ret;
    }

    ret = kvm_put_vcpu_events(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_debugregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be last */
    ret = kvm_guest_debug_workarounds(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

int kvm_arch_get_registers(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    int ret;

    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));

    ret = kvm_getput_regs(cpu, 0);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xsave(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xcrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_sregs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_msrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_mp_state(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_apic(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_vcpu_events(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_debugregs(cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

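/* Runs on the vCPU thread right before reentering KVM_RUN: injects NMIs
 * and, without an in-kernel irqchip, delivers PIC interrupts and requests
 * interrupt-window exits. */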
void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int ret;

    /* Inject NMI */
    if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
        DPRINTF("injected NMI\n");
        ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
        if (ret < 0) {
            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
                    strerror(-ret));
        }
    }

    /* Force the VCPU out of its inner loop to process any INIT requests
     * or (for userspace APIC, but it is cheap to combine the checks here)
     * pending TPR access reports.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        cpu->exit_request = 1;
    }

    if (!kvm_irqchip_in_kernel()) {
        /* Try to inject an interrupt if the guest can accept it */
        if (run->ready_for_interrupt_injection &&
            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK)) {
            int irq;

            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            if (irq >= 0) {
                struct kvm_interrupt intr;

                intr.irq = irq;
                DPRINTF("injected interrupt %d\n", irq);
                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
                if (ret < 0) {
                    fprintf(stderr,
                            "KVM: injection failed, interrupt lost (%s)\n",
                            strerror(-ret));
                }
            }
        }

        /* If we have an interrupt but the guest is not ready to receive an
         * interrupt, request an interrupt window exit.  This will
         * cause a return to userspace as soon as the guest is ready to
         * receive interrupts. */
        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
            run->request_interrupt_window = 1;
        } else {
            run->request_interrupt_window = 0;
        }

        DPRINTF("setting tpr\n");
        run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
    }
}

void kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    if (run->if_flag) {
        env->eflags |= IF_MASK;
    } else {
        env->eflags &= ~IF_MASK;
    }
    cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
    cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
}

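/* Process interrupt_request flags raised while the vCPU was outside
 * KVM_RUN; the return value reports whether the vCPU remains halted. */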
int kvm_arch_process_async_events(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
        assert(env->mcg_cap);

        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;

        kvm_cpu_synchronize_state(cs);

        if (env->exception_injected == EXCP08_DBLE) {
            /* this means triple fault */
            qemu_system_reset_request();
            cs->exit_request = 1;
            return 0;
        }
        env->exception_injected = EXCP12_MCHK;
        env->has_error_code = 0;

        cs->halted = 0;
        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
            env->mp_state = KVM_MP_STATE_RUNNABLE;
        }
    }

    if (cs->interrupt_request & CPU_INTERRUPT_INIT) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_init(cpu);
    }

    if (kvm_irqchip_in_kernel()) {
        return 0;
    }

    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(cpu->apic_state);
    }
    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
        cs->halted = 0;
    }
    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_sipi(cpu);
    }
    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
        kvm_cpu_synchronize_state(cs);
        apic_handle_tpr_access_report(cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return cs->halted;
}

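/* HLT exit: keep the vCPU halted unless an unmasked interrupt or an NMI
 * is pending. */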
static int kvm_handle_halt(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;

    if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
        cs->halted = 1;
        return EXCP_HLT;
    }

    return 0;
}

static int kvm_handle_tpr_access(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    struct kvm_run *run = cs->kvm_run;

    apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
                                                           : TPR_ACCESS_READ);
    return 1;
}

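/* Software breakpoints are implemented by patching an int3 opcode (0xcc)
 * over the first byte of the target instruction. */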
int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
{
    static const uint8_t int3 = 0xcc;

    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
{
    uint8_t int3;

    if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

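/* Mirror of the four x86 hardware breakpoint slots (DR0-DR3 plus their
 * DR7 type/length fields). */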
static struct {
    target_ulong addr;
    int len;
    int type;
} hw_breakpoint[4];

static int nb_hw_breakpoint;

static int find_hw_breakpoint(target_ulong addr, int len, int type)
{
    int n;

    for (n = 0; n < nb_hw_breakpoint; n++) {
        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
            (hw_breakpoint[n].len == len || len == -1)) {
            return n;
        }
    }
    return -1;
}

int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    switch (type) {
    case GDB_BREAKPOINT_HW:
        len = 1;
        break;
    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_ACCESS:
        switch (len) {
        case 1:
            break;
        case 2:
        case 4:
        case 8:
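            /* Watchpoints must be naturally aligned to their length. */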
            if (addr & (len - 1)) {
                return -EINVAL;
            }
            break;
        default:
            return -EINVAL;
        }
        break;
    default:
        return -ENOSYS;
    }

    if (nb_hw_breakpoint == 4) {
        return -ENOBUFS;
    }
    if (find_hw_breakpoint(addr, len, type) >= 0) {
        return -EEXIST;
    }
    hw_breakpoint[nb_hw_breakpoint].addr = addr;
    hw_breakpoint[nb_hw_breakpoint].len = len;
    hw_breakpoint[nb_hw_breakpoint].type = type;
    nb_hw_breakpoint++;

    return 0;
}

int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
    if (n < 0) {
        return -ENOENT;
    }
    nb_hw_breakpoint--;
    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];

    return 0;
}

void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}

static CPUWatchpoint hw_watchpoint;

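/* Decode a KVM_EXIT_DEBUG exit: DR6 bit 14 flags a single-step trap and
 * bits 0-3 flag the hardware breakpoint/watchpoint slots; debug events we
 * do not consume ourselves are reinjected into the guest. */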
static int kvm_handle_debug(X86CPU *cpu,
                            struct kvm_debug_exit_arch *arch_info)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;
    int ret = 0;
    int n;

    if (arch_info->exception == 1) {
        if (arch_info->dr6 & (1 << 14)) {
            if (cs->singlestep_enabled) {
                ret = EXCP_DEBUG;
            }
        } else {
            for (n = 0; n < 4; n++) {
                if (arch_info->dr6 & (1 << n)) {
                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
                    case 0x0:
                        ret = EXCP_DEBUG;
                        break;
                    case 0x1:
                        ret = EXCP_DEBUG;
                        cs->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_WRITE;
                        break;
                    case 0x3:
                        ret = EXCP_DEBUG;
                        cs->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_ACCESS;
                        break;
                    }
                }
            }
        }
    } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
        ret = EXCP_DEBUG;
    }
    if (ret == 0) {
        cpu_synchronize_state(cs);
        assert(env->exception_injected == -1);

        /* pass to guest */
        env->exception_injected = arch_info->exception;
        env->has_error_code = 0;
    }

    return ret;
}

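/* Translate QEMU's breakpoint lists into the kvm_guest_debug layout:
 * debugreg[0..3] take the breakpoint addresses and debugreg[7] is a DR7
 * image carrying the enable, type and length bits for each slot. */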
void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
{
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(cpu)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }
    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}

static bool host_supports_vmx(void)
{
    uint32_t ecx, unused;

    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
    return ecx & CPUID_EXT_VMX;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
    X86CPU *cpu = X86_CPU(cs);
    uint64_t code;
    int ret;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        DPRINTF("handle_hlt\n");
        ret = kvm_handle_halt(cpu);
        break;
    case KVM_EXIT_SET_TPR:
        ret = 0;
        break;
    case KVM_EXIT_TPR_ACCESS:
        ret = kvm_handle_tpr_access(cpu);
        break;
    case KVM_EXIT_FAIL_ENTRY:
        code = run->fail_entry.hardware_entry_failure_reason;
        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
                code);
        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
            fprintf(stderr,
                    "\nIf you're running a guest on an Intel machine without "
                        "unrestricted mode\n"
                    "support, the failure is most likely due to the guest "
                        "entering an invalid\n"
                    "state for Intel VT. For example, the guest may be "
                        "running in big real mode\n"
                    "which is not supported on less recent Intel processors."
                        "\n\n");
        }
        ret = -1;
        break;
    case KVM_EXIT_EXCEPTION:
        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
                run->ex.exception, run->ex.error_code);
        ret = -1;
        break;
    case KVM_EXIT_DEBUG:
        DPRINTF("kvm_exit_debug\n");
        ret = kvm_handle_debug(cpu, &run->debug.arch);
        break;
    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    return ret;
}

bool kvm_arch_stop_on_emulation_error(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    kvm_cpu_synchronize_state(cs);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

void kvm_arch_init_irq_routing(KVMState *s)
{
    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
        /* If kernel can't do irq routing, interrupt source
         * override 0->2 cannot be set up as required by HPET.
         * So we have to disable it.
         */
        no_hpet = 1;
    }
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use msi via irqfd and GSI routing.
     */
    kvm_irqfds_allowed = true;
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;
}

/* Classic KVM device assignment interface. Will remain x86 only. */
int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr,
                          uint32_t flags, uint32_t *dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .segnr = dev_addr->domain,
        .busnr = dev_addr->bus,
        .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function),
        .flags = flags,
    };
    int ret;

    dev_data.assigned_dev_id =
        (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn;

    ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data);
    if (ret < 0) {
        return ret;
    }

    *dev_id = dev_data.assigned_dev_id;

    return 0;
}
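
/* Typical call sequence (a sketch, not mandated by the API): assign the
 * device with kvm_device_pci_assign(), route its interrupts via the INTx,
 * MSI or MSI-X helpers below, and detach it again with
 * kvm_device_pci_deassign(). */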

int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data);
}

static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}

int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi,
                           uint32_t guest_irq)
{
    uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX);

    return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq);
}

int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
        .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data);
}

static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id,
                                     uint32_t type)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .flags = type,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq);
}

int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX));
}

int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI |
                                              KVM_DEV_IRQ_GUEST_MSI, virq);
}

int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI |
                                                KVM_DEV_IRQ_HOST_MSI);
}

bool kvm_device_msix_supported(KVMState *s)
{
    /* The kernel lacks a corresponding KVM_CAP, so we probe by calling
     * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */
    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT;
}

int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id,
                                 uint32_t nr_vectors)
{
    struct kvm_assigned_msix_nr msix_nr = {
        .assigned_dev_id = dev_id,
        .entry_nr = nr_vectors,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr);
}

int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector,
                               int virq)
{
    struct kvm_assigned_msix_entry msix_entry = {
        .assigned_dev_id = dev_id,
        .gsi = virq,
        .entry = vector,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry);
}

int kvm_device_msix_assign(KVMState *s, uint32_t dev_id)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX |
                                              KVM_DEV_IRQ_GUEST_MSIX, 0);
}

int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX |
                                                KVM_DEV_IRQ_HOST_MSIX);
}