提交 e37a07e0 编写于 作者: L Linus Torvalds

Merge tag 'kvm-4.13-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Radim Krčmář:
 "Second batch of KVM updates for v4.13

  Common:
   - add uevents for VM creation/destruction
   - annotate and properly access RCU-protected objects

  s390:
   - rename IOCTL added in the first v4.13 merge

  x86:
   - emulate VMLOAD VMSAVE feature in SVM
   - support paravirtual asynchronous page fault while nested
   - add Hyper-V userspace interfaces for better migration
   - improve master clock corner cases
   - extend internal error reporting after EPT misconfig
   - correct single-stepping of emulated instructions in SVM
   - handle MCE during VM entry
   - fix nVMX VM entry checks and nVMX VMCS shadowing"

* tag 'kvm-4.13-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  kvm: x86: hyperv: make VP_INDEX managed by userspace
  KVM: async_pf: Let guest support delivery of async_pf from guest mode
  KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf
  KVM: async_pf: Add L1 guest async_pf #PF vmexit handler
  KVM: x86: Simplify kvm_x86_ops->queue_exception parameter list
  kvm: x86: hyperv: add KVM_CAP_HYPERV_SYNIC2
  KVM: x86: make backwards_tsc_observed a per-VM variable
  KVM: trigger uevents when creating or destroying a VM
  KVM: SVM: Enable Virtual VMLOAD VMSAVE feature
  KVM: SVM: Add Virtual VMLOAD VMSAVE feature definition
  KVM: SVM: Rename lbr_ctl field in the vmcb control area
  KVM: SVM: Prepare for new bit definition in lbr_ctl
  KVM: SVM: handle singlestep exception when skipping emulated instructions
  KVM: x86: take slots_lock in kvm_free_pit
  KVM: s390: Fix KVM_S390_GET_CMMA_BITS ioctl definition
  kvm: vmx: Properly handle machine check during VM-entry
  KVM: x86: update master clock before computing kvmclock_offset
  kvm: nVMX: Shadow "high" parts of shadowed 64-bit VMCS fields
  kvm: nVMX: Fix nested_vmx_check_msr_bitmap_controls
  kvm: nVMX: Validate the I/O bitmaps on nested VM-entry
  ...
......@@ -4329,3 +4329,21 @@ Querying this capability returns a bitmap indicating the possible
virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N
(counting from the right) is set, then a virtual SMT mode of 2^N is
available.
8.11 KVM_CAP_HYPERV_SYNIC2
Architectures: x86
This capability enables a newer version of Hyper-V Synthetic interrupt
controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM
doesn't clear SynIC message and event flags pages when they are enabled by
writing to the respective MSRs.
8.12 KVM_CAP_HYPERV_VP_INDEX
Architectures: x86
This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its
value is used to denote the target vcpu for a SynIC interrupt. For
compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this
capability is absent, userspace can still query this msr's value.
......@@ -166,10 +166,11 @@ MSR_KVM_SYSTEM_TIME: 0x12
MSR_KVM_ASYNC_PF_EN: 0x4b564d02
data: Bits 63-6 hold 64-byte aligned physical address of a
64 byte memory area which must be in guest RAM and must be
zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1
when asynchronous page faults are enabled on the vcpu 0 when
disabled. Bit 1 is 1 if asynchronous page faults can be injected
when vcpu is in cpl == 0.
when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults
are delivered to L1 as #PF vmexits.
First 4 byte of 64 byte memory location will be written to by
the hypervisor at the time of asynchronous page fault (APF)
......
......@@ -286,6 +286,7 @@
#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
......
......@@ -23,6 +23,7 @@ struct x86_exception {
u16 error_code;
bool nested_page_fault;
u64 address; /* cr2 or nested page fault gpa */
u8 async_page_fault;
};
/*
......
......@@ -462,10 +462,12 @@ struct kvm_vcpu_hv_synic {
DECLARE_BITMAP(auto_eoi_bitmap, 256);
DECLARE_BITMAP(vec_bitmap, 256);
bool active;
bool dont_zero_synic_pages;
};
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u32 vp_index;
u64 hv_vapic;
s64 runtime_offset;
struct kvm_vcpu_hv_synic synic;
......@@ -549,6 +551,7 @@ struct kvm_vcpu_arch {
bool reinject;
u8 nr;
u32 error_code;
u8 nested_apf;
} exception;
struct kvm_queued_interrupt {
......@@ -649,6 +652,9 @@ struct kvm_vcpu_arch {
u64 msr_val;
u32 id;
bool send_user_only;
u32 host_apf_reason;
unsigned long nested_apf_token;
bool delivery_as_pf_vmexit;
} apf;
/* OSVW MSRs (AMD only) */
......@@ -803,6 +809,7 @@ struct kvm_arch {
int audit_point;
#endif
bool backwards_tsc_observed;
bool boot_vcpu_runs_old_kvmclock;
u32 bsp_vcpu_id;
......@@ -952,9 +959,7 @@ struct kvm_x86_ops {
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu);
void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject);
void (*queue_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
......
......@@ -83,7 +83,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
u64 virt_ext;
u32 clean;
u32 reserved_5;
u64 next_rip;
......@@ -119,6 +119,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
#define SVM_INTERRUPT_SHADOW_MASK 1
#define SVM_IOIO_STR_SHIFT 2
......
......@@ -67,6 +67,7 @@ struct kvm_clock_pairing {
#define KVM_ASYNC_PF_ENABLED (1 << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
......
......@@ -330,7 +330,12 @@ static void kvm_guest_cpu_init(void)
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
pa |= KVM_ASYNC_PF_ENABLED;
/* Async page fault support for L1 hypervisor is optional */
if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN,
(pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
......
......@@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
return 0;
}
static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id)
static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu = NULL;
int i;
if (vpidx < KVM_MAX_VCPUS)
vcpu = kvm_get_vcpu(kvm, vpidx);
if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
return vcpu;
return NULL;
}
static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu;
struct kvm_vcpu_hv_synic *synic;
if (vcpu_id >= atomic_read(&kvm->online_vcpus))
return NULL;
vcpu = kvm_get_vcpu(kvm, vcpu_id);
vcpu = get_vcpu_by_vpidx(kvm, vpidx);
if (!vcpu)
return NULL;
synic = vcpu_to_synic(vcpu);
......@@ -221,7 +234,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
synic->version = data;
break;
case HV_X64_MSR_SIEFP:
if (data & HV_SYNIC_SIEFP_ENABLE)
if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
!synic->dont_zero_synic_pages)
if (kvm_clear_guest(vcpu->kvm,
data & PAGE_MASK, PAGE_SIZE)) {
ret = 1;
......@@ -232,7 +246,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
synic_exit(synic, msr);
break;
case HV_X64_MSR_SIMP:
if (data & HV_SYNIC_SIMP_ENABLE)
if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
!synic->dont_zero_synic_pages)
if (kvm_clear_guest(vcpu->kvm,
data & PAGE_MASK, PAGE_SIZE)) {
ret = 1;
......@@ -318,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint)
int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
{
struct kvm_vcpu_hv_synic *synic;
synic = synic_get(kvm, vcpu_id);
synic = synic_get(kvm, vpidx);
if (!synic)
return -EINVAL;
......@@ -341,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
kvm_hv_notify_acked_sint(vcpu, i);
}
static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi)
static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
{
struct kvm_vcpu_hv_synic *synic;
synic = synic_get(kvm, vcpu_id);
synic = synic_get(kvm, vpidx);
if (!synic)
return -EINVAL;
......@@ -687,14 +702,24 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
stimer_init(&hv_vcpu->stimer[i], i);
}
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu)
void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
}
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
{
struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
/*
* Hyper-V SynIC auto EOI SINT's are
* not compatible with APICV, so deactivate APICV
*/
kvm_vcpu_deactivate_apicv(vcpu);
vcpu_to_synic(vcpu)->active = true;
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
return 0;
}
......@@ -978,6 +1003,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
switch (msr) {
case HV_X64_MSR_VP_INDEX:
if (!host)
return 1;
hv->vp_index = (u32)data;
break;
case HV_X64_MSR_APIC_ASSIST_PAGE: {
u64 gfn;
unsigned long addr;
......@@ -1089,18 +1119,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
int r;
struct kvm_vcpu *v;
kvm_for_each_vcpu(r, v, vcpu->kvm) {
if (v == vcpu) {
data = r;
break;
}
}
case HV_X64_MSR_VP_INDEX:
data = hv->vp_index;
break;
}
case HV_X64_MSR_EOI:
return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
case HV_X64_MSR_ICR:
......
......@@ -56,9 +56,10 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
......
......@@ -724,8 +724,10 @@ void kvm_free_pit(struct kvm *kvm)
struct kvm_pit *pit = kvm->arch.vpit;
if (pit) {
mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
mutex_unlock(&kvm->slots_lock);
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
......
......@@ -46,6 +46,7 @@
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
/*
* When setting this variable to true it enables Two-Dimensional-Paging
......@@ -3748,7 +3749,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
kvm_event_needs_reinjection(vcpu)))
return false;
if (is_guest_mode(vcpu))
if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
return false;
return kvm_x86_ops->interrupt_allowed(vcpu);
......@@ -3780,6 +3781,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false;
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len,
bool need_unprotect)
{
int r = 1;
switch (vcpu->arch.apf.host_apf_reason) {
default:
trace_kvm_page_fault(fault_address, error_code);
if (need_unprotect && kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wait(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wake(fault_address);
local_irq_enable();
break;
}
return r;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
static bool
check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
{
......
......@@ -77,6 +77,9 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len,
bool need_unprotect);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
......
......@@ -194,7 +194,6 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
u32 apf_reason;
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
......@@ -277,6 +276,10 @@ static int avic;
module_param(avic, int, S_IRUGO);
#endif
/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);
/* AVIC VM ID bit masks and lock */
static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
static DEFINE_SPINLOCK(avic_vm_id_lock);
......@@ -633,11 +636,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm_set_interrupt_shadow(vcpu, 0);
}
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
bool reinject = vcpu->arch.exception.reinject;
u32 error_code = vcpu->arch.exception.error_code;
/*
* If we are within a nested VM we'd better #VMEXIT and let the guest
......@@ -947,7 +952,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm)
{
u32 *msrpm = svm->msrpm;
svm->vmcb->control.lbr_ctl = 1;
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
......@@ -958,7 +963,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
{
u32 *msrpm = svm->msrpm;
svm->vmcb->control.lbr_ctl = 0;
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
......@@ -1093,6 +1098,16 @@ static __init int svm_hardware_setup(void)
}
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
return 0;
err:
......@@ -1280,6 +1295,16 @@ static void init_vmcb(struct vcpu_svm *svm)
if (avic)
avic_init_vmcb(svm);
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
* in VMCB and clear intercepts to avoid #VMEXIT.
*/
if (vls) {
clr_intercept(svm, INTERCEPT_VMLOAD);
clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
mark_all_dirty(svm->vmcb);
enable_gif(svm);
......@@ -2096,34 +2121,11 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
static int pf_interception(struct vcpu_svm *svm)
{
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code;
int r = 1;
u64 error_code = svm->vmcb->control.exit_info_1;
switch (svm->apf_reason) {
default:
error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
svm->vmcb->control.insn_bytes,
svm->vmcb->control.insn_len);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
svm->apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wait(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
svm->apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wake(fault_address);
local_irq_enable();
break;
}
return r;
svm->vmcb->control.insn_len, !npt_enabled);
}
static int db_interception(struct vcpu_svm *svm)
......@@ -2267,7 +2269,7 @@ static int io_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
int size, in, string, ret;
unsigned port;
++svm->vcpu.stat.io_exits;
......@@ -2279,10 +2281,16 @@ static int io_interception(struct vcpu_svm *svm)
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
ret = kvm_skip_emulated_instruction(&svm->vcpu);
return in ? kvm_fast_pio_in(vcpu, size, port)
: kvm_fast_pio_out(vcpu, size, port);
/*
* TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
if (in)
return kvm_fast_pio_in(vcpu, size, port) && ret;
else
return kvm_fast_pio_out(vcpu, size, port) && ret;
}
static int nmi_interception(struct vcpu_svm *svm)
......@@ -2415,15 +2423,19 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
if (!is_guest_mode(&svm->vcpu))
return 0;
vmexit = nested_svm_intercept(svm);
if (vmexit != NESTED_EXIT_DONE)
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
vmexit = nested_svm_intercept(svm);
if (vmexit == NESTED_EXIT_DONE)
svm->nested.exit_required = true;
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
svm->nested.exit_required = true;
return vmexit;
}
......@@ -2598,7 +2610,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
/* When we're shadowing, trap PFs, but not async PF */
if (!npt_enabled && svm->apf_reason == 0)
if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
return NESTED_EXIT_HOST;
break;
default:
......@@ -2645,7 +2657,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
}
/* async page fault always cause vmexit */
else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
svm->apf_reason != 0)
svm->vcpu.arch.exception.nested_apf != 0)
vmexit = NESTED_EXIT_DONE;
break;
}
......@@ -2702,7 +2714,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
dst->event_inj = from->event_inj;
dst->event_inj_err = from->event_inj_err;
dst->nested_cr3 = from->nested_cr3;
dst->lbr_ctl = from->lbr_ctl;
dst->virt_ext = from->virt_ext;
}
static int nested_svm_vmexit(struct vcpu_svm *svm)
......@@ -3008,7 +3020,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
/* We don't want to see VMMCALLs from a nested guest */
clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
......@@ -3055,6 +3067,7 @@ static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
struct page *page;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
......@@ -3064,18 +3077,19 @@ static int vmload_interception(struct vcpu_svm *svm)
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
nested_svm_unmap(page);
return 1;
return ret;
}
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
struct page *page;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
......@@ -3085,12 +3099,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
nested_svm_unmap(page);
return 1;
return ret;
}
static int vmrun_interception(struct vcpu_svm *svm)
......@@ -3123,25 +3137,29 @@ static int vmrun_interception(struct vcpu_svm *svm)
static int stgi_interception(struct vcpu_svm *svm)
{
int ret;
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
ret = kvm_skip_emulated_instruction(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
enable_gif(svm);
return 1;
return ret;
}
static int clgi_interception(struct vcpu_svm *svm)
{
int ret;
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
ret = kvm_skip_emulated_instruction(&svm->vcpu);
disable_gif(svm);
......@@ -3152,7 +3170,7 @@ static int clgi_interception(struct vcpu_svm *svm)
mark_dirty(svm->vmcb, VMCB_INTR);
}
return 1;
return ret;
}
static int invlpga_interception(struct vcpu_svm *svm)
......@@ -3166,8 +3184,7 @@ static int invlpga_interception(struct vcpu_svm *svm)
kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
return 1;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
static int skinit_interception(struct vcpu_svm *svm)
......@@ -3190,7 +3207,7 @@ static int xsetbv_interception(struct vcpu_svm *svm)
if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
return kvm_skip_emulated_instruction(&svm->vcpu);
}
return 1;
......@@ -3286,8 +3303,7 @@ static int invlpg_interception(struct vcpu_svm *svm)
return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
skip_emulated_instruction(&svm->vcpu);
return 1;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
static int emulate_on_interception(struct vcpu_svm *svm)
......@@ -3437,9 +3453,7 @@ static int dr_interception(struct vcpu_svm *svm)
kvm_register_write(&svm->vcpu, reg, val);
}
skip_emulated_instruction(&svm->vcpu);
return 1;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
static int cr8_write_interception(struct vcpu_svm *svm)
......@@ -3562,6 +3576,7 @@ static int rdmsr_interception(struct vcpu_svm *svm)
if (svm_get_msr(&svm->vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
return 1;
} else {
trace_kvm_msr_read(ecx, msr_info.data);
......@@ -3570,9 +3585,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
msr_info.data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
return kvm_skip_emulated_instruction(&svm->vcpu);
}
return 1;
}
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
......@@ -3698,11 +3712,11 @@ static int wrmsr_interception(struct vcpu_svm *svm)
if (kvm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
return 1;
} else {
trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(&svm->vcpu);
return kvm_skip_emulated_instruction(&svm->vcpu);
}
return 1;
}
static int msr_interception(struct vcpu_svm *svm)
......@@ -3731,8 +3745,7 @@ static int pause_interception(struct vcpu_svm *svm)
static int nop_interception(struct vcpu_svm *svm)
{
skip_emulated_instruction(&(svm->vcpu));
return 1;
return kvm_skip_emulated_instruction(&(svm->vcpu));
}
static int monitor_interception(struct vcpu_svm *svm)
......@@ -4117,7 +4130,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
......@@ -4965,7 +4978,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
svm->apf_reason = kvm_read_and_reset_pf_reason();
svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
if (npt_enabled) {
vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
......
......@@ -2422,28 +2422,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
*/
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
if (!(vmcs12->exception_bitmap & (1u << nr)))
if (!((vmcs12->exception_bitmap & (1u << nr)) ||
(nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
return 0;
if (vcpu->arch.exception.nested_apf) {
vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
vcpu->arch.apf.nested_apf_token);
return 1;
}
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
return 1;
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
bool reinject = vcpu->arch.exception.reinject;
u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (!reinject && is_guest_mode(vcpu) &&
nested_vmx_check_exception(vcpu, nr))
nested_vmx_check_exception(vcpu))
return;
if (has_error_code) {
......@@ -3764,6 +3777,25 @@ static void free_kvm_area(void)
}
}
enum vmcs_field_type {
VMCS_FIELD_TYPE_U16 = 0,
VMCS_FIELD_TYPE_U64 = 1,
VMCS_FIELD_TYPE_U32 = 2,
VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
};
static inline int vmcs_field_type(unsigned long field)
{
if (0x1 & field) /* the *_HIGH fields are all 32 bit */
return VMCS_FIELD_TYPE_U32;
return (field >> 13) & 0x3 ;
}
static inline int vmcs_field_readonly(unsigned long field)
{
return (((field >> 10) & 0x3) == 1);
}
static void init_vmcs_shadow_fields(void)
{
int i, j;
......@@ -3789,14 +3821,22 @@ static void init_vmcs_shadow_fields(void)
/* shadowed fields guest access without vmexit */
for (i = 0; i < max_shadow_read_write_fields; i++) {
clear_bit(shadow_read_write_fields[i],
vmx_vmwrite_bitmap);
clear_bit(shadow_read_write_fields[i],
vmx_vmread_bitmap);
unsigned long field = shadow_read_write_fields[i];
clear_bit(field, vmx_vmwrite_bitmap);
clear_bit(field, vmx_vmread_bitmap);
if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
clear_bit(field + 1, vmx_vmwrite_bitmap);
clear_bit(field + 1, vmx_vmread_bitmap);
}
}
for (i = 0; i < max_shadow_read_only_fields; i++) {
unsigned long field = shadow_read_only_fields[i];
clear_bit(field, vmx_vmread_bitmap);
if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
clear_bit(field + 1, vmx_vmread_bitmap);
}
for (i = 0; i < max_shadow_read_only_fields; i++)
clear_bit(shadow_read_only_fields[i],
vmx_vmread_bitmap);
}
static __init int alloc_kvm_area(void)
......@@ -4634,6 +4674,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
return true;
}
static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}
static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn;
......@@ -5664,14 +5709,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
BUG_ON(enable_ept);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
trace_kvm_page_fault(cr2, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
true);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
......@@ -7214,25 +7256,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
return nested_vmx_run(vcpu, false);
}
enum vmcs_field_type {
VMCS_FIELD_TYPE_U16 = 0,
VMCS_FIELD_TYPE_U64 = 1,
VMCS_FIELD_TYPE_U32 = 2,
VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
};
static inline int vmcs_field_type(unsigned long field)
{
if (0x1 & field) /* the *_HIGH fields are all 32 bit */
return VMCS_FIELD_TYPE_U32;
return (field >> 13) & 0x3 ;
}
static inline int vmcs_field_readonly(unsigned long field)
{
return (((field >> 10) & 0x3) == 1);
}
/*
* Read a vmcs12 field. Since these can have varying lengths and we return
* one type, we chose the biggest type (u64) and zero-extend the return value
......@@ -8014,7 +8037,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
return enable_ept;
return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
......@@ -8418,9 +8441,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.ndata = 2;
vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.ndata++;
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
return 0;
}
......@@ -8611,17 +8640,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
u32 exit_intr_info = 0;
u16 basic_exit_reason = (u16)vmx->exit_reason;
if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
|| vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
|| basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
return;
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
exit_intr_info = vmx->exit_intr_info;
if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
vmx->exit_intr_info = exit_intr_info;
/* if exit due to PF check for async PF */
if (is_page_fault(exit_intr_info))
vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
/* Handle machine checks before interrupts are enabled */
if (is_machine_check(exit_intr_info))
if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
is_machine_check(exit_intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
......@@ -9589,23 +9625,26 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
return 0;
if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
!page_address_valid(vcpu, vmcs12->io_bitmap_b))
return -EINVAL;
return 0;
}
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
int maxphyaddr;
u64 addr;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return 0;
if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
WARN_ON(1);
return -EINVAL;
}
maxphyaddr = cpuid_maxphyaddr(vcpu);
if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
((addr + PAGE_SIZE) >> maxphyaddr))
if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
return -EINVAL;
return 0;
......@@ -10293,6 +10332,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
......@@ -10429,8 +10471,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
return 1;
}
vmcs12->launch_state = 1;
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
......@@ -10804,6 +10844,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
/* vm_entry_intr_info_field is cleared on exit. Emulate this
* instead of reading the real value. */
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
......
......@@ -134,8 +134,6 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
......@@ -452,7 +450,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
vcpu->arch.cr2 = fault->address;
vcpu->arch.exception.nested_apf =
is_guest_mode(vcpu) && fault->async_page_fault;
if (vcpu->arch.exception.nested_apf)
vcpu->arch.apf.nested_apf_token = fault->address;
else
vcpu->arch.cr2 = fault->address;
kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
......@@ -1719,7 +1722,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
&& !backwards_tsc_observed
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
......@@ -2060,8 +2063,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
/* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c)
/* Bits 3:5 are reserved, Should be zero */
if (data & 0x38)
return 1;
vcpu->arch.apf.msr_val = data;
......@@ -2077,6 +2080,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
kvm_async_pf_wakeup_all(vcpu);
return 0;
}
......@@ -2661,6 +2665,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
......@@ -3384,10 +3390,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return -EINVAL;
switch (cap->cap) {
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
return kvm_hv_activate_synic(vcpu);
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
default:
return -EINVAL;
}
......@@ -4188,9 +4198,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = 0;
/*
* TODO: userspace has to take care of races with VCPU_RUN, so
* kvm_gen_update_masterclock() can be cut down to locked
* pvclock_update_vm_gtod_copy().
*/
kvm_gen_update_masterclock(kvm);
now_ns = get_kvmclock_ns(kvm);
kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
kvm_gen_update_masterclock(kvm);
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
break;
}
case KVM_GET_CLOCK: {
......@@ -6347,10 +6363,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_update_dr7(vcpu);
}
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.reinject);
kvm_x86_ops->queue_exception(vcpu);
return 0;
}
......@@ -7676,6 +7689,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
kvm_hv_vcpu_postcreate(vcpu);
if (vcpu_load(vcpu))
return;
msr.data = 0x0;
......@@ -7829,8 +7844,8 @@ int kvm_arch_hardware_enable(void)
*/
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
backwards_tsc_observed = true;
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm->arch.backwards_tsc_observed = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
vcpu->arch.last_host_tsc = local_tsc;
......@@ -8576,6 +8591,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
}
}
......@@ -8598,6 +8614,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
}
vcpu->arch.apf.halted = false;
......
......@@ -234,7 +234,7 @@ struct kvm_vcpu {
int guest_fpu_loaded, guest_xcr0_loaded;
struct swait_queue_head wq;
struct pid *pid;
struct pid __rcu *pid;
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
......@@ -390,7 +390,7 @@ struct kvm {
spinlock_t mmu_lock;
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
/*
......@@ -404,7 +404,7 @@ struct kvm {
int last_boosted_vcpu;
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus *buses[KVM_NR_BUSES];
struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_EVENTFD
struct {
spinlock_t lock;
......@@ -473,6 +473,12 @@ struct kvm {
#define vcpu_err(vcpu, fmt, ...) \
kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
{
return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
lockdep_is_held(&kvm->slots_lock));
}
static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
{
/* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case
......@@ -562,9 +568,8 @@ void kvm_put_kvm(struct kvm *kvm);
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
return rcu_dereference_check(kvm->memslots[as_id],
srcu_read_lock_held(&kvm->srcu)
|| lockdep_is_held(&kvm->slots_lock));
return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
lockdep_is_held(&kvm->slots_lock));
}
static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
......
......@@ -927,6 +927,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_CMMA_MIGRATION 145
#define KVM_CAP_PPC_FWNMI 146
#define KVM_CAP_PPC_SMT_POSSIBLE 147
#define KVM_CAP_HYPERV_SYNIC2 148
#define KVM_CAP_HYPERV_VP_INDEX 149
#ifdef KVM_CAP_IRQ_ROUTING
......@@ -1351,7 +1353,7 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_X86_SMM */
#define KVM_SMI _IO(KVMIO, 0xb7)
/* Available with KVM_CAP_S390_CMMA_MIGRATION */
#define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log)
#define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
#define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
......
......@@ -825,7 +825,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
if (ret < 0)
goto unlock_fail;
kvm->buses[bus_idx]->ioeventfd_count++;
kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
list_add_tail(&p->list, &kvm->ioeventfds);
mutex_unlock(&kvm->slots_lock);
......@@ -848,6 +848,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
{
struct _ioeventfd *p, *tmp;
struct eventfd_ctx *eventfd;
struct kvm_io_bus *bus;
int ret = -ENOENT;
eventfd = eventfd_ctx_fdget(args->fd);
......@@ -870,8 +871,9 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
continue;
kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
if (kvm->buses[bus_idx])
kvm->buses[bus_idx]->ioeventfd_count--;
bus = kvm_get_bus(kvm, bus_idx);
if (bus)
bus->ioeventfd_count--;
ioeventfd_release(p);
ret = 0;
break;
......
......@@ -230,7 +230,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
}
mutex_lock(&kvm->irq_lock);
old = kvm->irq_routing;
old = rcu_dereference_protected(kvm->irq_routing, 1);
rcu_assign_pointer(kvm->irq_routing, new);
kvm_irq_routing_update(kvm);
kvm_arch_irq_routing_update(kvm);
......
......@@ -130,6 +130,12 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
......@@ -187,12 +193,23 @@ static void ack_flush(void *_completed)
{
}
static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
{
if (unlikely(!cpus))
cpus = cpu_online_mask;
if (cpumask_empty(cpus))
return false;
smp_call_function_many(cpus, ack_flush, NULL, wait);
return true;
}
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
int i, cpu, me;
cpumask_var_t cpus;
bool called = true;
bool wait = req & KVM_REQUEST_WAIT;
bool called;
struct kvm_vcpu *vcpu;
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
......@@ -207,14 +224,9 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
if (cpus != NULL && cpu != -1 && cpu != me &&
kvm_request_needs_ipi(vcpu, req))
cpumask_set_cpu(cpu, cpus);
__cpumask_set_cpu(cpu, cpus);
}
if (unlikely(cpus == NULL))
smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait);
else if (!cpumask_empty(cpus))
smp_call_function_many(cpus, ack_flush, NULL, wait);
else
called = false;
called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
put_cpu();
free_cpumask_var(cpus);
return called;
......@@ -293,7 +305,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
put_pid(vcpu->pid);
/*
* no need for rcu_read_lock as VCPU_RUN is the only place that
* will change the vcpu->pid pointer and on uninit all file
* descriptors are already gone.
*/
put_pid(rcu_dereference_protected(vcpu->pid, 1));
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
......@@ -674,8 +691,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
if (!kvm->buses[i])
goto out_err;
}
......@@ -700,9 +717,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
hardware_disable_all();
out_err_no_disable:
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm->buses[i]);
kfree(rcu_access_pointer(kvm->buses[i]));
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
kvm_free_memslots(kvm, kvm->memslots[i]);
kvm_free_memslots(kvm,
rcu_dereference_protected(kvm->memslots[i], 1));
kvm_arch_free_vm(kvm);
mmdrop(current->mm);
return ERR_PTR(r);
......@@ -728,6 +746,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
int i;
struct mm_struct *mm = kvm->mm;
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
kvm_destroy_vm_debugfs(kvm);
kvm_arch_sync_events(kvm);
spin_lock(&kvm_lock);
......@@ -735,8 +754,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) {
if (kvm->buses[i])
kvm_io_bus_destroy(kvm->buses[i]);
struct kvm_io_bus *bus;
bus = rcu_dereference_protected(kvm->buses[i], 1);
if (bus)
kvm_io_bus_destroy(bus);
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
......@@ -748,7 +770,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_destroy_vm(kvm);
kvm_destroy_devices(kvm);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
kvm_free_memslots(kvm, kvm->memslots[i]);
kvm_free_memslots(kvm,
rcu_dereference_protected(kvm->memslots[i], 1));
cleanup_srcu_struct(&kvm->irq_srcu);
cleanup_srcu_struct(&kvm->srcu);
kvm_arch_free_vm(kvm);
......@@ -2551,13 +2574,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (r)
return r;
switch (ioctl) {
case KVM_RUN:
case KVM_RUN: {
struct pid *oldpid;
r = -EINVAL;
if (arg)
goto out;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
oldpid = rcu_access_pointer(vcpu->pid);
if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
rcu_assign_pointer(vcpu->pid, newpid);
......@@ -2568,6 +2592,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;
......@@ -3202,6 +3227,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
fput(file);
return -ENOMEM;
}
kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
fd_install(r, file);
return r;
......@@ -3563,7 +3589,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
{
struct kvm_io_bus *new_bus, *bus;
bus = kvm->buses[bus_idx];
bus = kvm_get_bus(kvm, bus_idx);
if (!bus)
return -ENOMEM;
......@@ -3592,7 +3618,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
int i;
struct kvm_io_bus *new_bus, *bus;
bus = kvm->buses[bus_idx];
bus = kvm_get_bus(kvm, bus_idx);
if (!bus)
return;
......@@ -3854,6 +3880,67 @@ static const struct file_operations *stat_fops[] = {
[KVM_STAT_VM] = &vm_stat_fops,
};
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
struct kobj_uevent_env *env;
char *tmp, *pathbuf = NULL;
unsigned long long created, active;
if (!kvm_dev.this_device || !kvm)
return;
spin_lock(&kvm_lock);
if (type == KVM_EVENT_CREATE_VM) {
kvm_createvm_count++;
kvm_active_vms++;
} else if (type == KVM_EVENT_DESTROY_VM) {
kvm_active_vms--;
}
created = kvm_createvm_count;
active = kvm_active_vms;
spin_unlock(&kvm_lock);
env = kzalloc(sizeof(*env), GFP_KERNEL);
if (!env)
return;
add_uevent_var(env, "CREATED=%llu", created);
add_uevent_var(env, "COUNT=%llu", active);
if (type == KVM_EVENT_CREATE_VM)
add_uevent_var(env, "EVENT=create");
else if (type == KVM_EVENT_DESTROY_VM)
add_uevent_var(env, "EVENT=destroy");
if (kvm->debugfs_dentry) {
char p[ITOA_MAX_LEN];
snprintf(p, sizeof(p), "%s", kvm->debugfs_dentry->d_name.name);
tmp = strchrnul(p + 1, '-');
*tmp = '\0';
add_uevent_var(env, "PID=%s", p);
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (pathbuf) {
/* sizeof counts the final '\0' */
int len = sizeof("STATS_PATH=") - 1;
const char *pvar = "STATS_PATH=";
tmp = dentry_path_raw(kvm->debugfs_dentry,
pathbuf + len,
PATH_MAX - len);
if (!IS_ERR(tmp)) {
memcpy(tmp - len, pvar, len);
env->envp[env->envp_idx++] = tmp - len;
}
}
}
/* no need for checks, since we are adding at most only 5 keys */
env->envp[env->envp_idx++] = NULL;
kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
kfree(env);
kfree(pathbuf);
}
static int kvm_init_debug(void)
{
int r = -EEXIST;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册