提交 d8242d22 编写于 作者: L Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Bugfixes, many of them reported by syzkaller and mostly predating the
  merge window"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb
  kvm: mmu: Fix race in emulated page table writes
  KVM: nVMX: vmcs12 revision_id is always VMCS12_REVISION even when copied from eVMCS
  KVM: nVMX: Verify eVMCS revision id match supported eVMCS version on eVMCS VMPTRLD
  KVM: nVMX/nSVM: Fix bug which sets vcpu->arch.tsc_offset to L1 tsc_offset
  x86/kvm/vmx: fix old-style function declaration
  KVM: x86: fix empty-body warnings
  KVM: VMX: Update shared MSRs to be saved/restored on MSR_EFER.LMA changes
  KVM: x86: Fix kernel info-leak in KVM_HC_CLOCK_PAIRING hypercall
  KVM: nVMX: Fix kernel info-leak when enabling KVM_CAP_HYPERV_ENLIGHTENED_VMCS more than once
  svm: Add mutex_lock to protect apic_access_page_done on AMD systems
  KVM: X86: Fix scan ioapic use-before-initialization
  KVM: LAPIC: Fix pv ipis use-before-initialization
  KVM: VMX: re-add ple_gap module parameter
  KVM: PPC: Book3S HV: Fix handling for interrupted H_ENTER_NESTED
......@@ -983,6 +983,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
ret = kvmhv_enter_nested_guest(vcpu);
if (ret == H_INTERRUPT) {
kvmppc_set_gpr(vcpu, 3, 0);
vcpu->arch.hcall_needed = 0;
return -EINTR;
}
break;
......
......@@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
/* Returns actual tsc_offset set in active VMCS */
u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
......
......@@ -55,7 +55,7 @@
#define PRIo64 "o"
/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
#define apic_debug(fmt, arg...)
#define apic_debug(fmt, arg...) do {} while (0)
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
......@@ -576,6 +576,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
if (unlikely(!map)) {
count = -EOPNOTSUPP;
goto out;
}
if (min > map->max_apic_id)
goto out;
/* Bits above cluster_size are masked in the caller. */
......
......@@ -5074,9 +5074,9 @@ static bool need_remote_flush(u64 old, u64 new)
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
const u8 *new, int *bytes)
int *bytes)
{
u64 gentry;
u64 gentry = 0;
int r;
/*
......@@ -5088,22 +5088,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
}
switch (*bytes) {
case 4:
gentry = *(const u32 *)new;
break;
case 8:
gentry = *(const u64 *)new;
break;
default:
gentry = 0;
break;
if (*bytes == 4 || *bytes == 8) {
r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
if (r)
gentry = 0;
}
return gentry;
......@@ -5207,8 +5197,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
/*
* No need to care whether allocation memory is successful
* or not since pte prefetch is skiped if it does not have
......@@ -5217,6 +5205,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_topup_memory_caches(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
......
......@@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
......@@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
return svm->vmcb->control.tsc_offset;
}
static void avic_init_vmcb(struct vcpu_svm *svm)
......@@ -1664,20 +1665,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
static int avic_init_access_page(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
int ret;
int ret = 0;
mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page_done)
return 0;
goto out;
ret = x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
PAGE_SIZE);
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
PAGE_SIZE);
if (ret)
return ret;
goto out;
kvm->arch.apic_access_page_done = true;
return 0;
out:
mutex_unlock(&kvm->slots_lock);
return ret;
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
......@@ -2189,21 +2193,31 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
return ERR_PTR(err);
}
static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
int i;
for_each_online_cpu(i)
cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
/*
* The vmcb page can be recycled, causing a false negative in
* svm_vcpu_load(). So, ensure that no logical CPU has this
* vmcb page recorded as its current vmcb.
*/
svm_clear_current_vmcb(svm->vmcb);
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
/*
* The vmcb page can be recycled, causing a false negative in
* svm_vcpu_load(). So do a full IBPB now.
*/
indirect_branch_prediction_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
......@@ -7149,7 +7163,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.read_l1_tsc_offset = svm_read_l1_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.write_l1_tsc_offset = svm_write_l1_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
......
......@@ -174,6 +174,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);
......@@ -984,6 +985,7 @@ struct vcpu_vmx {
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
bool guest_msrs_dirty;
unsigned long host_idt_base;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
......@@ -1306,7 +1308,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
......@@ -1610,12 +1612,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/* We don't support disabling the feature for simplicity. */
if (vmx->nested.enlightened_vmcs_enabled)
return 0;
vmx->nested.enlightened_vmcs_enabled = true;
/*
* vmcs_version represents the range of supported Enlightened VMCS
* versions: lower 8 bits is the minimal version, higher 8 bits is the
......@@ -1625,6 +1621,12 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
if (vmcs_version)
*vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
/* We don't support disabling the feature for simplicity. */
if (vmx->nested.enlightened_vmcs_enabled)
return 0;
vmx->nested.enlightened_vmcs_enabled = true;
vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
......@@ -2897,6 +2899,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->req_immediate_exit = false;
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
vmx->guest_msrs_dirty = false;
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
if (vmx->loaded_cpu_state)
return;
......@@ -2957,11 +2973,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmcs_writel(HOST_GS_BASE, gs_base);
host_state->gs_base = gs_base;
}
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
......@@ -3436,6 +3447,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
vmx->guest_msrs_dirty = true;
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
......@@ -3452,11 +3464,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
/*
* writes 'offset' into guest's timestamp counter offset register
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
u64 active_offset = offset;
if (is_guest_mode(vcpu)) {
/*
* We're here if L1 chose not to trap WRMSR to TSC. According
......@@ -3464,17 +3474,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* set for L2 remains unchanged, and still needs to be added
* to the newly set TSC to get L2's TSC.
*/
struct vmcs12 *vmcs12;
/* recalculate vmcs02.TSC_OFFSET: */
vmcs12 = get_vmcs12(vcpu);
vmcs_write64(TSC_OFFSET, offset +
(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
vmcs12->tsc_offset : 0));
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
active_offset += vmcs12->tsc_offset;
} else {
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vmcs_read64(TSC_OFFSET), offset);
vmcs_write64(TSC_OFFSET, offset);
}
vmcs_write64(TSC_OFFSET, active_offset);
return active_offset;
}
/*
......@@ -5944,7 +5953,7 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
......@@ -5982,7 +5991,7 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
}
}
static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
......@@ -6020,7 +6029,7 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm
}
}
static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type, bool value)
{
if (value)
......@@ -8664,8 +8673,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
vmcs12->hdr.revision_id = evmcs->revision_id;
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
......@@ -9369,7 +9376,30 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
/*
* Currently, KVM only supports eVMCS version 1
* (== KVM_EVMCS_VERSION) and thus we expect guest to set this
* value to first u32 field of eVMCS which should specify eVMCS
* VersionNumber.
*
* Guest should be aware of supported eVMCS versions by host by
* examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
* expected to set this CPUID leaf according to the value
* returned in vmcs_version from nested_enable_evmcs().
*
* However, it turns out that Microsoft Hyper-V fails to comply
* to their own invented interface: When Hyper-V use eVMCS, it
* just sets first u32 field of eVMCS to revision_id specified
* in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
* which is one of the supported versions specified in
* CPUID.0x4000000A.EAX[0:15].
*
* To overcome Hyper-V bug, we accept here either a supported
* eVMCS version or VMCS12 revision_id as valid values for first
* u32 field of eVMCS.
*/
if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu);
return 0;
}
......@@ -9390,9 +9420,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
* present in struct hv_enlightened_vmcs, ...). Make sure there
* are no leftovers.
*/
if (from_launch)
memset(vmx->nested.cached_vmcs12, 0,
sizeof(*vmx->nested.cached_vmcs12));
if (from_launch) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
memset(vmcs12, 0, sizeof(*vmcs12));
vmcs12->hdr.revision_id = VMCS12_REVISION;
}
}
return 1;
......@@ -15062,7 +15094,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
......
......@@ -1665,8 +1665,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_offset = offset;
vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
......@@ -1794,7 +1793,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
......@@ -6918,6 +6918,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
clock_pairing.nsec = ts.tv_nsec;
clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
clock_pairing.flags = 0;
memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
ret = 0;
if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
......@@ -7455,7 +7456,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
else {
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
if (is_guest_mode(vcpu))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册