diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 512d53397041371534ca3dd47701448fb490148f..ae0a034ab0f2048adfe3aaa4148f0e5263f7b22b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -15,6 +15,7 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/utsname.h>
 
 #include <linux/kvm.h>
 
@@ -53,6 +54,8 @@
 #define BUS_MCEERR_AO 5
 #endif
 
+static int lm_capable_kernel;
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -239,12 +242,16 @@ static void kvm_do_inject_x86_mce(void *_data)
     struct kvm_x86_mce_data *data = _data;
     int r;
 
-    /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
-    r = kvm_mce_in_exception(data->env);
-    if (r == -1)
-        fprintf(stderr, "Failed to get MCE status\n");
-    else if (r && !(data->mce->status & MCI_STATUS_AR))
-        return;
+    /* If there is an MCE exception being processed, ignore this SRAO MCE */
+    if ((data->env->mcg_cap & MCG_SER_P) &&
+        !(data->mce->status & MCI_STATUS_AR)) {
+        r = kvm_mce_in_exception(data->env);
+        if (r == -1) {
+            fprintf(stderr, "Failed to get MCE status\n");
+        } else if (r) {
+            return;
+        }
+    }
 
     r = kvm_set_mce(data->env, data->mce);
     if (r < 0) {
@@ -434,23 +441,26 @@ void kvm_arch_reset_vcpu(CPUState *env)
     }
 }
 
-static int kvm_has_msr_star(CPUState *env)
+int has_msr_star;
+int has_msr_hsave_pa;
+
+static void kvm_supported_msrs(CPUState *env)
 {
-    static int has_msr_star;
+    static int kvm_supported_msrs;
     int ret;
 
     /* first time */
-    if (has_msr_star == 0) {
+    if (kvm_supported_msrs == 0) {
         struct kvm_msr_list msr_list, *kvm_msr_list;
 
-        has_msr_star = -1;
+        kvm_supported_msrs = -1;
 
         /* Obtain MSR list from KVM. These are the MSRs that we must
          * save/restore */
         msr_list.nmsrs = 0;
         ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list);
         if (ret < 0 && ret != -E2BIG) {
-            return 0;
+            return;
         }
         /* Old kernel modules had a bug and could write beyond the provided
            memory. Allocate at least a safe amount of 1K. */
@@ -466,7 +476,11 @@ static int kvm_has_msr_star(CPUState *env)
             for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                 if (kvm_msr_list->indices[i] == MSR_STAR) {
                     has_msr_star = 1;
-                    break;
+                    continue;
+                }
+                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
+                    has_msr_hsave_pa = 1;
+                    continue;
                 }
             }
         }
@@ -474,9 +488,19 @@ static int kvm_has_msr_star(CPUState *env)
         free(kvm_msr_list);
     }
 
-    if (has_msr_star == 1)
-        return 1;
-    return 0;
+    return;
+}
+
+static int kvm_has_msr_hsave_pa(CPUState *env)
+{
+    kvm_supported_msrs(env);
+    return has_msr_hsave_pa;
+}
+
+static int kvm_has_msr_star(CPUState *env)
+{
+    kvm_supported_msrs(env);
+    return has_msr_star;
 }
 
 static int kvm_init_identity_map_page(KVMState *s)
@@ -502,6 +526,11 @@ int kvm_arch_init(KVMState *s, int smp_cpus)
 {
     int ret;
 
+    struct utsname utsname;
+
+    uname(&utsname);
+    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
+
     /* create vm86 tss. KVM uses vm86 mode to emulate 16-bit code
      * directly. In order to use vm86 mode, a TSS is needed. Since this
      * must be part of guest physical memory, we need to allocate it. Older
@@ -779,28 +808,40 @@ static int kvm_put_msrs(CPUState *env, int level)
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int i, n = 0;
+    int n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
     if (kvm_has_msr_star(env))
         kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
+    if (kvm_has_msr_hsave_pa(env))
+        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
 #ifdef TARGET_X86_64
-    /* FIXME if lm capable */
-    kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
-    kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
-    kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
-    kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
+    if (lm_capable_kernel) {
+        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
+        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
+        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
+        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
+    }
 #endif
     if (level == KVM_PUT_FULL_STATE) {
-        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
+        /*
+         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
+         * writeback. Until this is fixed, we only write the offset to SMP
+         * guests after migration, desynchronizing the VCPUs, but avoiding
+         * huge jump-backs that would occur without any writeback at all.
+         */
+        if (smp_cpus == 1 || env->tsc != 0) {
+            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
+        }
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
 #ifdef KVM_CAP_MCE
     if (env->mcg_cap) {
+        int i;
         if (level == KVM_PUT_RESET_STATE)
             kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
         else if (level == KVM_PUT_FULL_STATE) {
@@ -1010,13 +1051,16 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_IA32_SYSENTER_EIP;
     if (kvm_has_msr_star(env))
         msrs[n++].index = MSR_STAR;
+    if (kvm_has_msr_hsave_pa(env))
+        msrs[n++].index = MSR_VM_HSAVE_PA;
     msrs[n++].index = MSR_IA32_TSC;
 #ifdef TARGET_X86_64
-    /* FIXME lm_capable_kernel */
-    msrs[n++].index = MSR_CSTAR;
-    msrs[n++].index = MSR_KERNELGSBASE;
-    msrs[n++].index = MSR_FMASK;
-    msrs[n++].index = MSR_LSTAR;
+    if (lm_capable_kernel) {
+        msrs[n++].index = MSR_CSTAR;
+        msrs[n++].index = MSR_KERNELGSBASE;
+        msrs[n++].index = MSR_FMASK;
+        msrs[n++].index = MSR_LSTAR;
+    }
 #endif
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
@@ -1066,6 +1110,9 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_IA32_TSC:
             env->tsc = msrs[i].data;
             break;
+        case MSR_VM_HSAVE_PA:
+            env->vm_hsave = msrs[i].data;
+            break;
         case MSR_KVM_SYSTEM_TIME:
             env->system_time_msr = msrs[i].data;
             break;
@@ -1085,9 +1132,9 @@ static int kvm_get_msrs(CPUState *env)
             if (msrs[i].index >= MSR_MC0_CTL &&
                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
-                break;
             }
 #endif
+            break;
         }
     }
 
@@ -1632,6 +1679,28 @@ static void hardware_memory_error(void)
     exit(1);
 }
 
+#ifdef KVM_CAP_MCE
+static void kvm_mce_broadcast_rest(CPUState *env)
+{
+    CPUState *cenv;
+    int family, model, cpuver = env->cpuid_version;
+
+    family = (cpuver >> 8) & 0xf;
+    model = ((cpuver >> 12) & 0xf0) + ((cpuver >> 4) & 0xf);
+
+    /* Broadcast MCA signal for processor version 06H_EH and above */
+    if ((family == 6 && model >= 14) || family > 6) {
+        for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            if (cenv == env) {
+                continue;
+            }
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    }
+}
+#endif
+
 int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
 {
 #if defined(KVM_CAP_MCE)
@@ -1689,6 +1758,7 @@ int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
             fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
             abort();
         }
+        kvm_mce_broadcast_rest(env);
     } else
 #endif
     {
@@ -1711,7 +1781,6 @@ int kvm_on_sigbus(int code, void *addr)
     void *vaddr;
     ram_addr_t ram_addr;
     target_phys_addr_t paddr;
-    CPUState *cenv;
 
     /* Hope we are lucky for AO MCE */
     vaddr = addr;
@@ -1727,10 +1796,7 @@ int kvm_on_sigbus(int code, void *addr)
         kvm_inject_x86_mce(first_cpu, 9, status,
                            MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
                            (MCM_ADDR_PHYS << 6) | 0xc, 1);
-        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
-            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
-                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
-        }
+        kvm_mce_broadcast_rest(first_cpu);
     } else
 #endif
     {
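
Note on the family/model test in kvm_mce_broadcast_rest() above: it decodes env->cpuid_version using the standard CPUID leaf-1 layout, with the family in bits 11:8 and the model formed from the extended-model field (bits 19:16, shifted left by 4) plus the base model (bits 7:4). A minimal standalone sketch of that arithmetic follows; it is not part of the patch, and the sample value 0x000106a5 and the main() harness are illustrative only.

    #include <stdio.h>

    /* Mirror the decode in kvm_mce_broadcast_rest(): family from bits 11:8,
     * model from (extended model bits 19:16) << 4 plus base model bits 7:4. */
    static void decode_cpu_version(unsigned int cpuver, int *family, int *model)
    {
        *family = (cpuver >> 8) & 0xf;
        *model = ((cpuver >> 12) & 0xf0) + ((cpuver >> 4) & 0xf);
    }

    int main(void)
    {
        int family, model;

        /* 0x000106a5 decodes to family 6, model 26 (a Nehalem-class value). */
        decode_cpu_version(0x000106a5, &family, &model);
        printf("family %d, model %d -> broadcast MCE to all VCPUs: %s\n",
               family, model,
               ((family == 6 && model >= 14) || family > 6) ? "yes" : "no");
        return 0;
    }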