Commit 08d19f51 authored by Linus Torvalds

Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits)
  KVM: ia64: Add intel iommu support for guests.
  KVM: ia64: add directed mmio range support for kvm guests
  KVM: ia64: Make pmt table be able to hold physical mmio entries.
  KVM: Move irqchip_in_kernel() from ioapic.h to irq.h
  KVM: Separate irq ack notification out of arch/x86/kvm/irq.c
  KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs
  KVM: Move device assignment logic to common code
  KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/
  KVM: VMX: enable invlpg exiting if EPT is disabled
  KVM: x86: Silence various LAPIC-related host kernel messages
  KVM: Device Assignment: Map mmio pages into VT-d page table
  KVM: PIC: enhance IPI avoidance
  KVM: MMU: add "oos_shadow" parameter to disable oos
  KVM: MMU: speed up mmu_unsync_walk
  KVM: MMU: out of sync shadow core
  KVM: MMU: mmu_convert_notrap helper
  KVM: MMU: awareness of new kvm_mmu_zap_page behaviour
  KVM: MMU: mmu_parent_walk
  KVM: x86: trap invlpg
  KVM: MMU: sync roots on mmu reload
  ...
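Several of the features in this batch are advertised to userspace as new KVM capabilities (for example KVM_CAP_IOMMU and KVM_CAP_MP_STATE on ia64, visible in the hunks below). As a rough illustrative sketch only — not code from this merge — a VMM could probe them like this:

    /* Illustrative only: probe two of the capabilities touched by this merge. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);

            if (kvm < 0)
                    return 1;
            /* 1 if the in-kernel VT-d backing for assigned devices is usable. */
            printf("KVM_CAP_IOMMU:    %d\n",
                   ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU));
            /* 1 now that ia64 implements the mp_state ioctls (see below). */
            printf("KVM_CAP_MP_STATE: %d\n",
                   ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE));
            return 0;
    }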
@@ -2448,7 +2448,14 @@ S: Supported
 KERNEL VIRTUAL MACHINE (KVM)
 P: Avi Kivity
-M: avi@qumranet.com
+M: avi@redhat.com
+L: kvm@vger.kernel.org
+W: http://kvm.qumranet.com
+S: Supported
+
+KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
+P: Joerg Roedel
+M: joerg.roedel@amd.com
 L: kvm@vger.kernel.org
 W: http://kvm.qumranet.com
 S: Supported
......
@@ -132,7 +132,7 @@
 #define GPFN_IOSAPIC    (4UL << 60)  /* IOSAPIC base */
 #define GPFN_LEGACY_IO  (5UL << 60)  /* Legacy I/O base */
 #define GPFN_GFW        (6UL << 60)  /* Guest Firmware */
-#define GPFN_HIGH_MMIO  (7UL << 60)  /* High MMIO range */
+#define GPFN_PHYS_MMIO  (7UL << 60)  /* Directed MMIO Range */

 #define GPFN_IO_MASK    (7UL << 60)  /* Guest pfn is I/O type */
 #define GPFN_INV_MASK   (1UL << 63)  /* Guest pfn is invalid */
@@ -413,6 +413,10 @@ struct kvm_arch {
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
 	struct kvm_sal_data rdv_sal_data;
+
+	struct list_head assigned_dev_head;
+	struct dmar_domain *intel_iommu_domain;
+	struct hlist_head irq_ack_notifier_list;
 };

 union cpuid3_t {
......
@@ -46,4 +46,6 @@ config KVM_INTEL
 config KVM_TRACE
 	bool

+source drivers/virtio/Kconfig
+
 endif # VIRTUALIZATION
@@ -44,7 +44,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/

 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o)
+		coalesced_mmio.o irq_comm.o)
+
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif

 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
......
/*
* irq.h: In-kernel interrupt controller related definitions
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Authors:
* Xiantao Zhang <xiantao.zhang@intel.com>
*
*/
#ifndef __IRQ_H
#define __IRQ_H
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return 1;
}
#endif
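ia64 gains this stub because the in-kernel ioapic is always present there, so common code written for x86 can keep guarding irqchip paths with the same helper. A minimal, hypothetical illustration of the calling pattern (the real call sites live in virt/kvm and arch/x86, not in this header):

    /* Hypothetical caller, for illustration only. */
    static void deliver_gsi(struct kvm *kvm, int gsi, int level)
    {
            if (irqchip_in_kernel(kvm))     /* always true on ia64 */
                    kvm_set_irq(kvm, gsi, level);
    }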
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/intel-iommu.h>

 #include <asm/pgtable.h>
 #include <asm/gcc_intrin.h>
@@ -45,6 +46,7 @@
 #include "iodev.h"
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"

 static unsigned long kvm_vmm_base;
 static unsigned long kvm_vsa_base;
@@ -179,12 +181,16 @@ int kvm_dev_ioctl_check_extension(long ext)

 	switch (ext) {
 	case KVM_CAP_IRQCHIP:
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_MP_STATE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+	case KVM_CAP_IOMMU:
+		r = intel_iommu_found();
+		break;
 	default:
 		r = 0;
 	}
@@ -771,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
 	 */
 	kvm_build_io_pmt(kvm);

+	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 }

 struct kvm *kvm_arch_create_vm(void)
@@ -1334,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)

 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_iommu_unmap_guest(kvm);
+#ifdef  KVM_CAP_DEVICE_ASSIGNMENT
+	kvm_free_all_assigned_devices(kvm);
+#endif
 	kfree(kvm->arch.vioapic);
 	kvm_release_vm_pages(kvm);
 	kvm_free_physmem(kvm);
@@ -1435,17 +1446,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		int user_alloc)
 {
 	unsigned long i;
-	struct page *page;
+	unsigned long pfn;
 	int npages = mem->memory_size >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
 	unsigned long base_gfn = memslot->base_gfn;

 	for (i = 0; i < npages; i++) {
-		page = gfn_to_page(kvm, base_gfn + i);
-		kvm_set_pmt_entry(kvm, base_gfn + i,
-				page_to_pfn(page) << PAGE_SHIFT,
-				_PAGE_AR_RWX|_PAGE_MA_WB);
-		memslot->rmap[i] = (unsigned long)page;
+		pfn = gfn_to_pfn(kvm, base_gfn + i);
+		if (!kvm_is_mmio_pfn(pfn)) {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					pfn << PAGE_SHIFT,
+					_PAGE_AR_RWX | _PAGE_MA_WB);
+			memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+		} else {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
+					_PAGE_MA_UC);
+			memslot->rmap[i] = 0;
+		}
 	}

 	return 0;
@@ -1789,11 +1807,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 		struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	vcpu_load(vcpu);
+	mp_state->mp_state = vcpu->arch.mp_state;
+	vcpu_put(vcpu);
+	return 0;
+}
+
+static int vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	int r;
+	long psr;
+	local_irq_save(psr);
+	r = kvm_insert_vmm_mapping(vcpu);
+	if (r)
+		goto fail;
+
+	vcpu->arch.launched = 0;
+	kvm_arch_vcpu_uninit(vcpu);
+	r = kvm_arch_vcpu_init(vcpu);
+	if (r)
+		goto fail;
+
+	kvm_purge_vmm_mapping(vcpu);
+	r = 0;
+fail:
+	local_irq_restore(psr);
+	return r;
 }

 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 		struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	int r = 0;
+
+	vcpu_load(vcpu);
+	vcpu->arch.mp_state = mp_state->mp_state;
+	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+		r = vcpu_reset(vcpu);
+	vcpu_put(vcpu);
+	return r;
 }
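The two ioctls above are driven from userspace with KVM_GET_MP_STATE / KVM_SET_MP_STATE; a hedged sketch (not part of this patch) of the reset path that KVM_MP_STATE_UNINITIALIZED now triggers:

    /* Illustrative userspace snippet, assuming a vcpu fd from KVM_CREATE_VCPU. */
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int reset_vcpu(int vcpu_fd)
    {
            struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_UNINITIALIZED };

            /* Lands in kvm_arch_vcpu_ioctl_set_mpstate() -> vcpu_reset() above. */
            return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
    }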
...@@ -50,27 +50,18 @@ ...@@ -50,27 +50,18 @@
#define PAL_VSA_SYNC_READ \ #define PAL_VSA_SYNC_READ \
/* begin to call pal vps sync_read */ \ /* begin to call pal vps sync_read */ \
{.mii; \
add r25 = VMM_VPD_BASE_OFFSET, r21; \ add r25 = VMM_VPD_BASE_OFFSET, r21; \
adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21; /* entry point */ \ nop 0x0; \
mov r24=ip; \
;; \ ;; \
} \
{.mmb \
add r24=0x20, r24; \
ld8 r25 = [r25]; /* read vpd base */ \ ld8 r25 = [r25]; /* read vpd base */ \
ld8 r20 = [r20]; \ br.cond.sptk kvm_vps_sync_read; /*call the service*/ \
;; \
add r20 = PAL_VPS_SYNC_READ,r20; \
;; \
{ .mii; \
nop 0x0; \
mov r24 = ip; \
mov b0 = r20; \
;; \ ;; \
}; \ }; \
{ .mmb; \
add r24 = 0x20, r24; \
nop 0x0; \
br.cond.sptk b0; /* call the service */ \
;; \
};
#define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21 #define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21
......
/* /*
* arch/ia64/vmx/optvfault.S * arch/ia64/kvm/optvfault.S
* optimize virtualization fault handler * optimize virtualization fault handler
* *
* Copyright (C) 2006 Intel Co * Copyright (C) 2006 Intel Co
* Xuefei Xu (Anthony Xu) <anthony.xu@intel.com> * Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
* Copyright (C) 2008 Intel Co
* Add the support for Tukwila processors.
* Xiantao Zhang <xiantao.zhang@intel.com>
*/ */
#include <asm/asmmacro.h> #include <asm/asmmacro.h>
...@@ -20,6 +23,98 @@ ...@@ -20,6 +23,98 @@
#define ACCE_MOV_TO_PSR #define ACCE_MOV_TO_PSR
#define ACCE_THASH #define ACCE_THASH
#define VMX_VPS_SYNC_READ \
add r16=VMM_VPD_BASE_OFFSET,r21; \
mov r17 = b0; \
mov r18 = r24; \
mov r19 = r25; \
mov r20 = r31; \
;; \
{.mii; \
ld8 r16 = [r16]; \
nop 0x0; \
mov r24 = ip; \
;; \
}; \
{.mmb; \
add r24=0x20, r24; \
mov r25 =r16; \
br.sptk.many kvm_vps_sync_read; \
}; \
mov b0 = r17; \
mov r24 = r18; \
mov r25 = r19; \
mov r31 = r20
ENTRY(kvm_vps_entry)
adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
;;
ld8 r29 = [r29]
;;
add r29 = r29, r30
;;
mov b0 = r29
br.sptk.many b0
END(kvm_vps_entry)
/*
* Inputs:
* r24 : return address
* r25 : vpd
* r29 : scratch
*
*/
GLOBAL_ENTRY(kvm_vps_sync_read)
movl r30 = PAL_VPS_SYNC_READ
;;
br.sptk.many kvm_vps_entry
END(kvm_vps_sync_read)
/*
* Inputs:
* r24 : return address
* r25 : vpd
* r29 : scratch
*
*/
GLOBAL_ENTRY(kvm_vps_sync_write)
movl r30 = PAL_VPS_SYNC_WRITE
;;
br.sptk.many kvm_vps_entry
END(kvm_vps_sync_write)
/*
* Inputs:
* r23 : pr
* r24 : guest b0
* r25 : vpd
*
*/
GLOBAL_ENTRY(kvm_vps_resume_normal)
movl r30 = PAL_VPS_RESUME_NORMAL
;;
mov pr=r23,-2
br.sptk.many kvm_vps_entry
END(kvm_vps_resume_normal)
/*
* Inputs:
* r23 : pr
* r24 : guest b0
* r25 : vpd
* r17 : isr
*/
GLOBAL_ENTRY(kvm_vps_resume_handler)
movl r30 = PAL_VPS_RESUME_HANDLER
;;
ld8 r27=[r25]
shr r17=r17,IA64_ISR_IR_BIT
;;
dep r27=r17,r27,63,1 // bit 63 of r27 indicate whether enable CFLE
mov pr=r23,-2
br.sptk.many kvm_vps_entry
END(kvm_vps_resume_handler)
//mov r1=ar3 //mov r1=ar3
GLOBAL_ENTRY(kvm_asm_mov_from_ar) GLOBAL_ENTRY(kvm_asm_mov_from_ar)
#ifndef ACCE_MOV_FROM_AR #ifndef ACCE_MOV_FROM_AR
...@@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm) ...@@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
#ifndef ACCE_RSM #ifndef ACCE_RSM
br.many kvm_virtualization_fault_back br.many kvm_virtualization_fault_back
#endif #endif
add r16=VMM_VPD_BASE_OFFSET,r21 VMX_VPS_SYNC_READ
;;
extr.u r26=r25,6,21 extr.u r26=r25,6,21
extr.u r27=r25,31,2 extr.u r27=r25,31,2
;; ;;
ld8 r16=[r16]
extr.u r28=r25,36,1 extr.u r28=r25,36,1
dep r26=r27,r26,21,2 dep r26=r27,r26,21,2
;; ;;
...@@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm) ...@@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
tbit.nz p6,p0=r23,0 tbit.nz p6,p0=r23,0
;; ;;
tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
(p6) br.dptk kvm_resume_to_guest (p6) br.dptk kvm_resume_to_guest_with_sync
;; ;;
add r26=VMM_VCPU_META_RR0_OFFSET,r21 add r26=VMM_VCPU_META_RR0_OFFSET,r21
add r27=VMM_VCPU_META_RR0_OFFSET+8,r21 add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
...@@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm) ...@@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
mov rr[r28]=r27 mov rr[r28]=r27
;; ;;
srlz.d srlz.d
br.many kvm_resume_to_guest br.many kvm_resume_to_guest_with_sync
END(kvm_asm_rsm) END(kvm_asm_rsm)
...@@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm) ...@@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
#ifndef ACCE_SSM #ifndef ACCE_SSM
br.many kvm_virtualization_fault_back br.many kvm_virtualization_fault_back
#endif #endif
add r16=VMM_VPD_BASE_OFFSET,r21 VMX_VPS_SYNC_READ
;;
extr.u r26=r25,6,21 extr.u r26=r25,6,21
extr.u r27=r25,31,2 extr.u r27=r25,31,2
;; ;;
ld8 r16=[r16]
extr.u r28=r25,36,1 extr.u r28=r25,36,1
dep r26=r27,r26,21,2 dep r26=r27,r26,21,2
;; //r26 is imm24 ;; //r26 is imm24
...@@ -271,7 +366,7 @@ kvm_asm_ssm_1: ...@@ -271,7 +366,7 @@ kvm_asm_ssm_1:
tbit.nz p6,p0=r29,IA64_PSR_I_BIT tbit.nz p6,p0=r29,IA64_PSR_I_BIT
;; ;;
tbit.z.or p6,p0=r19,IA64_PSR_I_BIT tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
(p6) br.dptk kvm_resume_to_guest (p6) br.dptk kvm_resume_to_guest_with_sync
;; ;;
add r29=VPD_VTPR_START_OFFSET,r16 add r29=VPD_VTPR_START_OFFSET,r16
add r30=VPD_VHPI_START_OFFSET,r16 add r30=VPD_VHPI_START_OFFSET,r16
...@@ -286,7 +381,7 @@ kvm_asm_ssm_1: ...@@ -286,7 +381,7 @@ kvm_asm_ssm_1:
;; ;;
cmp.gt p6,p0=r30,r17 cmp.gt p6,p0=r30,r17
(p6) br.dpnt.few kvm_asm_dispatch_vexirq (p6) br.dpnt.few kvm_asm_dispatch_vexirq
br.many kvm_resume_to_guest br.many kvm_resume_to_guest_with_sync
END(kvm_asm_ssm) END(kvm_asm_ssm)
...@@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr) ...@@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
#ifndef ACCE_MOV_TO_PSR #ifndef ACCE_MOV_TO_PSR
br.many kvm_virtualization_fault_back br.many kvm_virtualization_fault_back
#endif #endif
add r16=VMM_VPD_BASE_OFFSET,r21 VMX_VPS_SYNC_READ
extr.u r26=r25,13,7 //r2
;; ;;
ld8 r16=[r16] extr.u r26=r25,13,7 //r2
addl r20=@gprel(asm_mov_from_reg),gp addl r20=@gprel(asm_mov_from_reg),gp
;; ;;
adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20 adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
...@@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1: ...@@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1:
;; ;;
tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
tbit.z.or p6,p0=r30,IA64_PSR_I_BIT tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
(p6) br.dpnt.few kvm_resume_to_guest (p6) br.dpnt.few kvm_resume_to_guest_with_sync
;; ;;
add r29=VPD_VTPR_START_OFFSET,r16 add r29=VPD_VTPR_START_OFFSET,r16
add r30=VPD_VHPI_START_OFFSET,r16 add r30=VPD_VHPI_START_OFFSET,r16
...@@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1: ...@@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1:
;; ;;
cmp.gt p6,p0=r30,r17 cmp.gt p6,p0=r30,r17
(p6) br.dpnt.few kvm_asm_dispatch_vexirq (p6) br.dpnt.few kvm_asm_dispatch_vexirq
br.many kvm_resume_to_guest br.many kvm_resume_to_guest_with_sync
END(kvm_asm_mov_to_psr) END(kvm_asm_mov_to_psr)
ENTRY(kvm_asm_dispatch_vexirq) ENTRY(kvm_asm_dispatch_vexirq)
//increment iip //increment iip
mov r17 = b0
mov r18 = r31
{.mii
add r25=VMM_VPD_BASE_OFFSET,r21
nop 0x0
mov r24 = ip
;;
}
{.mmb
add r24 = 0x20, r24
ld8 r25 = [r25]
br.sptk.many kvm_vps_sync_write
}
mov b0 =r17
mov r16=cr.ipsr mov r16=cr.ipsr
mov r31 = r18
mov r19 = 37
;; ;;
extr.u r17=r16,IA64_PSR_RI_BIT,2 extr.u r17=r16,IA64_PSR_RI_BIT,2
tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1 tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
...@@ -435,20 +545,26 @@ GLOBAL_ENTRY(kvm_asm_thash) ...@@ -435,20 +545,26 @@ GLOBAL_ENTRY(kvm_asm_thash)
;; ;;
kvm_asm_thash_back1: kvm_asm_thash_back1:
shr.u r23=r19,61 // get RR number shr.u r23=r19,61 // get RR number
adds r25=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr adds r28=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr
adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta
;; ;;
shladd r27=r23,3,r25 // get vcpu->arch.vrr[r23]'s addr shladd r27=r23,3,r28 // get vcpu->arch.vrr[r23]'s addr
ld8 r17=[r16] // get PTA ld8 r17=[r16] // get PTA
mov r26=1 mov r26=1
;; ;;
extr.u r29=r17,2,6 // get pta.size extr.u r29=r17,2,6 // get pta.size
ld8 r25=[r27] // get vcpu->arch.vrr[r23]'s value ld8 r28=[r27] // get vcpu->arch.vrr[r23]'s value
;; ;;
extr.u r25=r25,2,6 // get rr.ps mov b0=r24
//Fallback to C if pta.vf is set
tbit.nz p6,p0=r17, 8
;;
(p6) mov r24=EVENT_THASH
(p6) br.cond.dpnt.many kvm_virtualization_fault_back
extr.u r28=r28,2,6 // get rr.ps
shl r22=r26,r29 // 1UL << pta.size shl r22=r26,r29 // 1UL << pta.size
;; ;;
shr.u r23=r19,r25 // vaddr >> rr.ps shr.u r23=r19,r28 // vaddr >> rr.ps
adds r26=3,r29 // pta.size + 3 adds r26=3,r29 // pta.size + 3
shl r27=r17,3 // pta << 3 shl r27=r17,3 // pta << 3
;; ;;
...@@ -724,6 +840,29 @@ END(asm_mov_from_reg) ...@@ -724,6 +840,29 @@ END(asm_mov_from_reg)
* r31: pr * r31: pr
* r24: b0 * r24: b0
*/ */
ENTRY(kvm_resume_to_guest_with_sync)
adds r19=VMM_VPD_BASE_OFFSET,r21
mov r16 = r31
mov r17 = r24
;;
{.mii
ld8 r25 =[r19]
nop 0x0
mov r24 = ip
;;
}
{.mmb
add r24 =0x20, r24
nop 0x0
br.sptk.many kvm_vps_sync_write
}
mov r31 = r16
mov r24 =r17
;;
br.sptk.many kvm_resume_to_guest
END(kvm_resume_to_guest_with_sync)
ENTRY(kvm_resume_to_guest) ENTRY(kvm_resume_to_guest)
adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
;; ;;
......
@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 void vmm_transition(struct kvm_vcpu *vcpu)
 {
 	ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
-						0, 0, 0, 0, 0, 0);
+						1, 0, 0, 0, 0, 0);
 	vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
 	ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
-						0, 0, 0, 0, 0, 0);
+						1, 0, 0, 0, 0, 0);
 	kvm_do_resume_op(vcpu);
 }
...@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir, ...@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
trp->rid = rid; trp->rid = rid;
} }
extern u64 kvm_lookup_mpa(u64 gpfn); extern u64 kvm_get_mpt_entry(u64 gpfn);
extern u64 kvm_gpa_to_mpa(u64 gpa);
/* Return I/O type if true */
#define __gpfn_is_io(gpfn) \
({ \
u64 pte, ret = 0; \
pte = kvm_lookup_mpa(gpfn); \
if (!(pte & GPFN_INV_MASK)) \
ret = pte & GPFN_IO_MASK; \
ret; \
})
/* Return I/ */
static inline u64 __gpfn_is_io(u64 gpfn)
{
u64 pte;
pte = kvm_get_mpt_entry(gpfn);
if (!(pte & GPFN_INV_MASK)) {
pte = pte & GPFN_IO_MASK;
if (pte != GPFN_PHYS_MMIO)
return pte;
}
return 0;
}
#endif #endif
#define IA64_NO_FAULT 0 #define IA64_NO_FAULT 0
#define IA64_FAULT 1 #define IA64_FAULT 1
......
...@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid: ...@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
adds r19=VMM_VPD_VPSR_OFFSET,r18 adds r19=VMM_VPD_VPSR_OFFSET,r18
;; ;;
ld8 r19=[r19] //vpsr ld8 r19=[r19] //vpsr
adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
;;
ld8 r20=[r20]
;;
//vsa_sync_write_start
mov r25=r18 mov r25=r18
adds r16= VMM_VCPU_GP_OFFSET,r21 adds r16= VMM_VCPU_GP_OFFSET,r21
;; ;;
...@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid: ...@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
;; ;;
add r24=r24,r16 add r24=r24,r16
;; ;;
add r16=PAL_VPS_SYNC_WRITE,r20 br.sptk.many kvm_vps_sync_write // call the service
;;
mov b0=r16
br.cond.sptk b0 // call the service
;; ;;
END(ia64_leave_hypervisor) END(ia64_leave_hypervisor)
// fall through // fall through
...@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry) ...@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
* r17:cr.isr * r17:cr.isr
* r18:vpd * r18:vpd
* r19:vpsr * r19:vpsr
* r20:__vsa_base
* r22:b0 * r22:b0
* r23:predicate * r23:predicate
*/ */
mov r24=r22 mov r24=r22
mov r25=r18 mov r25=r18
tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic
(p1) br.cond.sptk.few kvm_vps_resume_normal
(p2) br.cond.sptk.many kvm_vps_resume_handler
;; ;;
(p1) add r29=PAL_VPS_RESUME_NORMAL,r20
(p1) br.sptk.many ia64_vmm_entry_out
;;
tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT //p1=cr.isr.ir
;;
(p1) add r29=PAL_VPS_RESUME_NORMAL,r20
(p2) add r29=PAL_VPS_RESUME_HANDLER,r20
(p2) ld8 r26=[r25]
;;
ia64_vmm_entry_out:
mov pr=r23,-2
mov b0=r29
;;
br.cond.sptk b0 // call pal service
END(ia64_vmm_entry) END(ia64_vmm_entry)
...@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry) ...@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
//set up ipsr, iip, vpd.vpsr, dcr //set up ipsr, iip, vpd.vpsr, dcr
// For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1 // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
// For DCR: all bits 0 // For DCR: all bits 0
bsw.0
;;
mov r21 =r13
adds r14=-VMM_PT_REGS_SIZE, r12 adds r14=-VMM_PT_REGS_SIZE, r12
;; ;;
movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1 movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
...@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry) ...@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
;; ;;
srlz.i srlz.i
;; ;;
bsw.0
;;
mov r21 =r13
;;
bsw.1
;;
mov ar.rsc = 0 mov ar.rsc = 0
;; ;;
flushrs flushrs
...@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry) ...@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
ld8 r1 = [r20] ld8 r1 = [r20]
;; ;;
mov cr.iip=r4 mov cr.iip=r4
;;
adds r16=VMM_VPD_BASE_OFFSET,r13 adds r16=VMM_VPD_BASE_OFFSET,r13
adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
;; ;;
ld8 r18=[r16] ld8 r18=[r16]
ld8 r20=[r20]
;; ;;
adds r19=VMM_VPD_VPSR_OFFSET,r18 adds r19=VMM_VPD_VPSR_OFFSET,r18
;; ;;
......
@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)

 u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 {
-	u64 ps, ps_mask, paddr, maddr;
+	u64 ps, ps_mask, paddr, maddr, io_mask;
 	union pte_flags phy_pte;

 	ps = itir_ps(itir);
@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 	phy_pte.val = *pte;
 	paddr = *pte;
 	paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
-	maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
-	if (maddr & GPFN_IO_MASK) {
+	maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
+	io_mask = maddr & GPFN_IO_MASK;
+	if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
 		*pte |= VTLB_PTE_IO;
 		return -1;
 	}
@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 		 u64 ifa, int type)
 {
 	u64 ps;
-	u64 phy_pte;
+	u64 phy_pte, io_mask, index;
 	union ia64_rr vrr, mrr;
 	int ret = 0;

@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 	vrr.val = vcpu_get_rr(v, ifa);
 	mrr.val = ia64_get_rr(ifa);

+	index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
+	io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
 	phy_pte = translate_phy_pte(&pte, itir, ifa);

 	/* Ensure WB attribute if pte is related to a normal mem page,
 	 * which is required by vga acceleration since qemu maps shared
 	 * vram buffer with WB.
 	 */
-	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
+	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
+		io_mask != GPFN_PHYS_MMIO) {
 		pte &= ~_PAGE_MA_MASK;
 		phy_pte &= ~_PAGE_MA_MASK;
 	}
@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
 	}
 }

-u64 kvm_lookup_mpa(u64 gpfn)
+u64 kvm_get_mpt_entry(u64 gpfn)
 {
 	u64 *base = (u64 *) KVM_P2M_BASE;
 	return *(base + gpfn);
 }

+u64 kvm_lookup_mpa(u64 gpfn)
+{
+	u64 maddr;
+	maddr = kvm_get_mpt_entry(gpfn);
+	return maddr & _PAGE_PPN_MASK;
+}
+
 u64 kvm_gpa_to_mpa(u64 gpa)
 {
 	u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
......
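The split above leaves the ia64 P2M table holding one 64-bit entry per guest pfn: the machine page number in the _PAGE_PPN_MASK bits plus the GPFN_* attribute bits (GPFN_PHYS_MMIO, GPFN_INV_MASK, ...) in the top bits. A sketch of how the two helpers relate, under that assumption only:

    /* Illustration only; mirrors kvm_get_mpt_entry()/kvm_lookup_mpa() above. */
    static u64 example_machine_addr(u64 gpfn)
    {
            u64 entry = kvm_get_mpt_entry(gpfn);    /* raw P2M entry */

            if (entry & GPFN_INV_MASK)              /* no machine page behind it */
                    return 0;
            return entry & _PAGE_PPN_MASK;          /* what kvm_lookup_mpa() keeps */
    }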
@@ -81,11 +81,17 @@ struct kvm_vcpu_arch {
 	struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
-	/* Copy of the host's TLB. */
-	struct tlbe host_tlb[PPC44x_TLB_SIZE];
+
+	/* Track which TLB entries we've modified in the current exit. */
+	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];

 	u32 host_stack;
 	u32 host_pid;
+	u32 host_dbcr0;
+	u32 host_dbcr1;
+	u32 host_dbcr2;
+	u32 host_iac[4];
+	u32 host_msr;

 	u64 fpr[32];
 	u32 gpr[32];
@@ -123,7 +129,11 @@
 	u32 ivor[16];
 	u32 ivpr;
 	u32 pir;
+
+	u32 shadow_pid;
 	u32 pid;
+	u32 swap_pid;
+
 	u32 pvr;
 	u32 ccr0;
 	u32 ccr1;
......
@@ -64,6 +64,10 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
 extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
+extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
+
+/* XXX Book E specific */
+extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);

 extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
@@ -92,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 	kvm_vcpu_block(vcpu);
 }

+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+	if (vcpu->arch.pid != new_pid) {
+		vcpu->arch.pid = new_pid;
+		vcpu->arch.swap_pid = 1;
+	}
+}
+
 #endif /* __POWERPC_KVM_PPC_H__ */
@@ -359,8 +359,8 @@ int main(void)

 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-	DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
 	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
+	DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -372,7 +372,7 @@ int main(void)
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
-	DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
+	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));

 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <asm/mmu-44x.h> #include <asm/mmu-44x.h>
...@@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe) ...@@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW); return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
} }
/* Must be called with mmap_sem locked for writing. */
static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
unsigned int index) unsigned int index)
{ {
...@@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, ...@@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
} }
} }
void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
{
vcpu->arch.shadow_tlb_mod[i] = 1;
}
/* Caller must ensure that the specified guest TLB entry is safe to insert into /* Caller must ensure that the specified guest TLB entry is safe to insert into
* the shadow TLB. */ * the shadow TLB. */
void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
...@@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, ...@@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
stlbe = &vcpu->arch.shadow_tlb[victim]; stlbe = &vcpu->arch.shadow_tlb[victim];
/* Get reference to new page. */ /* Get reference to new page. */
down_read(&current->mm->mmap_sem);
new_page = gfn_to_page(vcpu->kvm, gfn); new_page = gfn_to_page(vcpu->kvm, gfn);
if (is_error_page(new_page)) { if (is_error_page(new_page)) {
printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
kvm_release_page_clean(new_page); kvm_release_page_clean(new_page);
up_read(&current->mm->mmap_sem);
return; return;
} }
hpaddr = page_to_phys(new_page); hpaddr = page_to_phys(new_page);
/* Drop reference to old page. */ /* Drop reference to old page. */
kvmppc_44x_shadow_release(vcpu, victim); kvmppc_44x_shadow_release(vcpu, victim);
up_read(&current->mm->mmap_sem);
vcpu->arch.shadow_pages[victim] = new_page; vcpu->arch.shadow_pages[victim] = new_page;
...@@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, ...@@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
/* XXX what about AS? */ /* XXX what about AS? */
stlbe->tid = asid & 0xff; stlbe->tid = !(asid & 0xff);
/* Force TS=1 for all guest mappings. */ /* Force TS=1 for all guest mappings. */
/* For now we hardcode 4KB mappings, but it will be important to /* For now we hardcode 4KB mappings, but it will be important to
* use host large pages in the future. */ * use host large pages in the future. */
stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
| PPC44x_TLB_4K; | PPC44x_TLB_4K;
stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags, stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
vcpu->arch.msr & MSR_PR); vcpu->arch.msr & MSR_PR);
kvmppc_tlbe_set_modified(vcpu, victim);
KVMTRACE_5D(STLB_WRITE, vcpu, victim,
stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
handler);
} }
void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
gva_t eend, u32 asid) gva_t eend, u32 asid)
{ {
unsigned int pid = asid & 0xff; unsigned int pid = !(asid & 0xff);
int i; int i;
/* XXX Replace loop with fancy data structures. */ /* XXX Replace loop with fancy data structures. */
down_write(&current->mm->mmap_sem);
for (i = 0; i <= tlb_44x_hwater; i++) { for (i = 0; i <= tlb_44x_hwater; i++) {
struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
unsigned int tid; unsigned int tid;
...@@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, ...@@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
kvmppc_44x_shadow_release(vcpu, i); kvmppc_44x_shadow_release(vcpu, i);
stlbe->word0 = 0; stlbe->word0 = 0;
kvmppc_tlbe_set_modified(vcpu, i);
KVMTRACE_5D(STLB_INVAL, vcpu, i,
stlbe->tid, stlbe->word0, stlbe->word1,
stlbe->word2, handler);
} }
up_write(&current->mm->mmap_sem);
} }
/* Invalidate all mappings, so that when they fault back in they will get the /* Invalidate all mappings on the privilege switch after PID has been changed.
* proper permission bits. */ * The guest always runs with PID=1, so we must clear the entire TLB when
* switching address spaces. */
void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
{ {
int i; int i;
if (vcpu->arch.swap_pid) {
/* XXX Replace loop with fancy data structures. */ /* XXX Replace loop with fancy data structures. */
down_write(&current->mm->mmap_sem);
for (i = 0; i <= tlb_44x_hwater; i++) { for (i = 0; i <= tlb_44x_hwater; i++) {
struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
/* Future optimization: clear only userspace mappings. */
kvmppc_44x_shadow_release(vcpu, i); kvmppc_44x_shadow_release(vcpu, i);
vcpu->arch.shadow_tlb[i].word0 = 0; stlbe->word0 = 0;
kvmppc_tlbe_set_modified(vcpu, i);
KVMTRACE_5D(STLB_INVAL, vcpu, i,
stlbe->tid, stlbe->word0, stlbe->word1,
stlbe->word2, handler);
} }
up_write(&current->mm->mmap_sem); vcpu->arch.swap_pid = 0;
}
vcpu->arch.shadow_pid = !usermode;
} }
@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
 	  Provides host support for KVM on Book E PowerPC processors. Currently
 	  this works on 440 processors only.

+config KVM_TRACE
+	bool "KVM trace support"
+	depends on KVM && MARKERS && SYSFS
+	select RELAY
+	select DEBUG_FS
+	default n
+	---help---
+	  This option allows reading a trace of kvm-related events through
+	  relayfs.  Note the ABI is not considered stable and will be
+	  modified in future updates.
+
 source drivers/virtio/Kconfig

 endif # VIRTUALIZATION
@@ -4,9 +4,11 @@
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm

-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)

-kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
+common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+
+kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
 obj-$(CONFIG_KVM) += kvm.o

 AFLAGS_booke_interrupts.o := -I$(obj)
......
...@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break; break;
} }
case BOOKE_INTERRUPT_DEBUG: {
u32 dbsr;
vcpu->arch.pc = mfspr(SPRN_CSRR0);
/* clear IAC events in DBSR register */
dbsr = mfspr(SPRN_DBSR);
dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
mtspr(SPRN_DBSR, dbsr);
run->exit_reason = KVM_EXIT_DEBUG;
r = RESUME_HOST;
break;
}
default: default:
printk(KERN_EMERG "exit_nr %d\n", exit_nr); printk(KERN_EMERG "exit_nr %d\n", exit_nr);
BUG(); BUG();
...@@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.msr = 0; vcpu->arch.msr = 0;
vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
vcpu->arch.shadow_pid = 1;
/* Eye-catching number so we know if the guest takes an interrupt /* Eye-catching number so we know if the guest takes an interrupt
* before it's programmed its own IVPR. */ * before it's programmed its own IVPR. */
vcpu->arch.ivpr = 0x55550000; vcpu->arch.ivpr = 0x55550000;
......
@@ -42,7 +42,8 @@
 #define HOST_STACK_LR	(HOST_STACK_SIZE + 4) /* In caller stack frame. */

 #define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
-                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))

 #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
                         (1<<BOOKE_INTERRUPT_DTLB_MISS))
...@@ -331,51 +332,57 @@ lightweight_exit: ...@@ -331,51 +332,57 @@ lightweight_exit:
mfspr r3, SPRN_PID mfspr r3, SPRN_PID
stw r3, VCPU_HOST_PID(r4) stw r3, VCPU_HOST_PID(r4)
lwz r3, VCPU_PID(r4) lwz r3, VCPU_SHADOW_PID(r4)
mtspr SPRN_PID, r3 mtspr SPRN_PID, r3
/* Prevent all TLB updates. */ /* Prevent all asynchronous TLB updates. */
mfmsr r5 mfmsr r5
lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
andc r6, r5, r6 andc r6, r5, r6
mtmsr r6 mtmsr r6
/* Save the host's non-pinned TLB mappings, and load the guest mappings /* Load the guest mappings, leaving the host's "pinned" kernel mappings
* over them. Leave the host's "pinned" kernel mappings in place. */ * in place. */
/* XXX optimization: use generation count to avoid swapping unmodified
* entries. */
mfspr r10, SPRN_MMUCR /* Save host MMUCR. */ mfspr r10, SPRN_MMUCR /* Save host MMUCR. */
lis r8, tlb_44x_hwater@ha li r5, PPC44x_TLB_SIZE
lwz r8, tlb_44x_hwater@l(r8) lis r5, tlb_44x_hwater@ha
addi r3, r4, VCPU_HOST_TLB - 4 lwz r5, tlb_44x_hwater@l(r5)
addi r9, r4, VCPU_SHADOW_TLB - 4 mtctr r5
li r6, 0 addi r9, r4, VCPU_SHADOW_TLB
addi r5, r4, VCPU_SHADOW_MOD
li r3, 0
1: 1:
/* Save host entry. */ lbzx r7, r3, r5
tlbre r7, r6, PPC44x_TLB_PAGEID cmpwi r7, 0
mfspr r5, SPRN_MMUCR beq 3f
stwu r5, 4(r3)
stwu r7, 4(r3)
tlbre r7, r6, PPC44x_TLB_XLAT
stwu r7, 4(r3)
tlbre r7, r6, PPC44x_TLB_ATTRIB
stwu r7, 4(r3)
/* Load guest entry. */ /* Load guest entry. */
lwzu r7, 4(r9) mulli r11, r3, TLBE_BYTES
add r11, r11, r9
lwz r7, 0(r11)
mtspr SPRN_MMUCR, r7 mtspr SPRN_MMUCR, r7
lwzu r7, 4(r9) lwz r7, 4(r11)
tlbwe r7, r6, PPC44x_TLB_PAGEID tlbwe r7, r3, PPC44x_TLB_PAGEID
lwzu r7, 4(r9) lwz r7, 8(r11)
tlbwe r7, r6, PPC44x_TLB_XLAT tlbwe r7, r3, PPC44x_TLB_XLAT
lwzu r7, 4(r9) lwz r7, 12(r11)
tlbwe r7, r6, PPC44x_TLB_ATTRIB tlbwe r7, r3, PPC44x_TLB_ATTRIB
/* Increment index. */ 3:
addi r6, r6, 1 addi r3, r3, 1 /* Increment index. */
cmpw r6, r8 bdnz 1b
blt 1b
mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */ mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */
/* Clear bitmap of modified TLB entries */
li r5, PPC44x_TLB_SIZE>>2
mtctr r5
addi r5, r4, VCPU_SHADOW_MOD - 4
li r6, 0
1:
stwu r6, 4(r5)
bdnz 1b
iccci 0, 0 /* XXX hack */ iccci 0, 0 /* XXX hack */
/* Load some guest volatiles. */ /* Load some guest volatiles. */
...@@ -431,6 +438,14 @@ lightweight_exit: ...@@ -431,6 +438,14 @@ lightweight_exit:
oris r3, r3, KVMPPC_MSR_MASK@h oris r3, r3, KVMPPC_MSR_MASK@h
ori r3, r3, KVMPPC_MSR_MASK@l ori r3, r3, KVMPPC_MSR_MASK@l
mtsrr1 r3 mtsrr1 r3
/* Clear any debug events which occurred since we disabled MSR[DE].
* XXX This gives us a 3-instruction window in which a breakpoint
* intended for guest context could fire in the host instead. */
lis r3, 0xffff
ori r3, r3, 0xffff
mtspr SPRN_DBSR, r3
lwz r3, VCPU_GPR(r3)(r4) lwz r3, VCPU_GPR(r3)(r4)
lwz r4, VCPU_GPR(r4)(r4) lwz r4, VCPU_GPR(r4)(r4)
rfi rfi
...@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst) ...@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags); kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
} }
KVMTRACE_5D(GTLB_WRITE, vcpu, index,
tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
handler);
return EMULATE_DONE; return EMULATE_DONE;
} }
...@@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ...@@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case SPRN_MMUCR: case SPRN_MMUCR:
vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
case SPRN_PID: case SPRN_PID:
vcpu->arch.pid = vcpu->arch.gpr[rs]; break; kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
case SPRN_CCR0: case SPRN_CCR0:
vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
case SPRN_CCR1: case SPRN_CCR1:
...@@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ...@@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
break; break;
} }
KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
if (advance) if (advance)
vcpu->arch.pc += 4; /* Advance past emulated instruction. */ vcpu->arch.pc += 4; /* Advance past emulated instruction. */
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <asm/cputable.h> #include <asm/cputable.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/kvm_ppc.h> #include <asm/kvm_ppc.h>
#include <asm/tlbflush.h>
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
...@@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) ...@@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{ {
} }
/* Note: clearing MSR[DE] just means that the debug interrupt will not be
* delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
* If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
* will be delivered as an "imprecise debug event" (which is indicated by
 * DBSR[IDE]).
*/
static void kvmppc_disable_debug_interrupts(void)
{
mtmsr(mfmsr() & ~MSR_DE);
}
static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
{
kvmppc_disable_debug_interrupts();
mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
mtmsr(vcpu->arch.host_msr);
}
static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
{
struct kvm_guest_debug *dbg = &vcpu->guest_debug;
u32 dbcr0 = 0;
vcpu->arch.host_msr = mfmsr();
kvmppc_disable_debug_interrupts();
/* Save host debug register state. */
vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
/* set registers up for guest */
if (dbg->bp[0]) {
mtspr(SPRN_IAC1, dbg->bp[0]);
dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
}
if (dbg->bp[1]) {
mtspr(SPRN_IAC2, dbg->bp[1]);
dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
}
if (dbg->bp[2]) {
mtspr(SPRN_IAC3, dbg->bp[2]);
dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
}
if (dbg->bp[3]) {
mtspr(SPRN_IAC4, dbg->bp[3]);
dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
}
mtspr(SPRN_DBCR0, dbcr0);
mtspr(SPRN_DBCR1, 0);
mtspr(SPRN_DBCR2, 0);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{ {
int i;
if (vcpu->guest_debug.enabled)
kvmppc_load_guest_debug_registers(vcpu);
/* Mark every guest entry in the shadow TLB entry modified, so that they
* will all be reloaded on the next vcpu run (instead of being
* demand-faulted). */
for (i = 0; i <= tlb_44x_hwater; i++)
kvmppc_tlbe_set_modified(vcpu, i);
} }
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{ {
if (vcpu->guest_debug.enabled)
kvmppc_restore_host_debug_state(vcpu);
/* Don't leave guest TLB entries resident when being de-scheduled. */
/* XXX It would be nice to differentiate between heavyweight exit and
* sched_out here, since we could avoid the TLB flush for heavyweight
* exits. */
_tlbia();
} }
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
                                     struct kvm_debug_guest *dbg)
 {
-	return -ENOTSUPP;
+	int i;
+
+	vcpu->guest_debug.enabled = dbg->enabled;
+	if (vcpu->guest_debug.enabled) {
+		for (i = 0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
+			if (dbg->breakpoints[i].enabled)
+				vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+			else
+				vcpu->guest_debug.bp[i] = 0;
+		}
+	}
+
+	return 0;
 }
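With this, 44x guests get up to four hardware instruction breakpoints programmed from the vcpu's guest_debug state. A sketch of the userspace side, assuming the pre-existing KVM_DEBUG_GUEST ioctl and struct kvm_debug_guest layout of this kernel generation:

    /* Hypothetical userspace helper; ioctl name and struct layout assumed
     * from the old KVM_DEBUG_GUEST ABI that this function backs. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_guest_breakpoint(int vcpu_fd, unsigned long addr)
    {
            struct kvm_debug_guest dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.enabled = 1;
            dbg.breakpoints[0].enabled = 1;
            dbg.breakpoints[0].address = addr;  /* copied into guest_debug.bp[0] */
            return ioctl(vcpu_fd, KVM_DEBUG_GUEST, &dbg);
    }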
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
......
@@ -565,13 +565,16 @@ config ZFCPDUMP
 	  Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.

 config S390_GUEST
-	bool "s390 guest support (EXPERIMENTAL)"
+	bool "s390 guest support for KVM (EXPERIMENTAL)"
 	depends on 64BIT && EXPERIMENTAL
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
 	help
-	  Select this option if you want to run the kernel under s390 linux
+	  Select this option if you want to run the kernel as a guest under
+	  the KVM hypervisor. This will add detection for KVM as well as a
+	  virtio transport. If KVM is detected, the virtio console will be
+	  the default console.
 endmenu

 source "net/Kconfig"
......
@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 	int rc;

 	vcpu->stat.instruction_stfl++;
-	facility_list &= ~(1UL<<24); /* no stfle */
-	facility_list &= ~(1UL<<23); /* no large pages */
+	/* only pass the facility bits, which we can handle */
+	facility_list &= 0xfe00fff3;
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
 			   &facility_list, sizeof(facility_list));
......
...@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) ...@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret; return ret;
} }
/*
 * If we don't preset lpj, there is the possibility that the guest
 * will calibrate under heavy load - thus, getting a lower lpj -
 * and execute the delays themselves without load. This is wrong,
 * because no delay loop can finish beforehand.
 * Any heuristic is bound to fail, because ultimately a large
 * pool of guests can be running and disturb each other. So we preset
 * lpj here.
 */
static unsigned long kvm_get_tsc_khz(void)
{
return preset_lpj;
}
static void kvm_get_preset_lpj(void)
{
struct pvclock_vcpu_time_info *src;
unsigned long khz;
u64 lpj;
src = &per_cpu(hv_clock, 0);
khz = pvclock_tsc_khz(src);
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
preset_lpj = lpj;
}
static struct clocksource kvm_clock = { static struct clocksource kvm_clock = {
.name = "kvm-clock", .name = "kvm-clock",
.read = kvm_clock_read, .read = kvm_clock_read,
...@@ -153,6 +181,7 @@ void __init kvmclock_init(void) ...@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read; pv_time_ops.sched_clock = kvm_clock_read;
pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif #endif
...@@ -163,6 +192,7 @@ void __init kvmclock_init(void) ...@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC #ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown; machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif #endif
kvm_get_preset_lpj();
clocksource_register(&kvm_clock); clocksource_register(&kvm_clock);
} }
} }
...@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, ...@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
return dst->version; return dst->version;
} }
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
u64 pv_tsc_khz = 1000000ULL << 32;
do_div(pv_tsc_khz, src->tsc_to_system_mul);
if (src->tsc_shift < 0)
pv_tsc_khz <<= -src->tsc_shift;
else
pv_tsc_khz >>= src->tsc_shift;
return pv_tsc_khz;
}
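pvclock converts TSC deltas to nanoseconds roughly as ns = (tsc_delta, shifted by tsc_shift) * tsc_to_system_mul >> 32, so the routine above simply inverts that scaling. A worked example with made-up numbers:

    /*
     * Example (illustrative numbers): for a 2 GHz TSC and tsc_shift == 0 the
     * hypervisor publishes tsc_to_system_mul ~= (1000000ULL << 32) / 2000000
     * = 0x80000000.  Inverting it as above gives
     *
     *     pv_tsc_khz = (1000000ULL << 32) / 0x80000000 = 2000000 kHz
     *
     * and kvm_get_preset_lpj() then derives lpj = 2000000 * 1000 / HZ,
     * e.g. 8,000,000 loops per jiffy for HZ=250.
     */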
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{ {
struct pvclock_shadow_time shadow; struct pvclock_shadow_time shadow;
......
@@ -3,10 +3,13 @@
 #

 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o)
+                coalesced_mmio.o irq_comm.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif

 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
......
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 	if (!atomic_inc_and_test(&pt->pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);

-	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	if (vcpu0 && waitqueue_active(&vcpu0->wq))
 		wake_up_interruptible(&vcpu0->wq);
-	}

 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
+	if (pt->period)
+		ps->channels[0].count_load_time = pt->timer.expires;

 	return (pt->period == 0 ? 0 : 1);
 }
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;

-	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
 		return atomic_read(&pit->pit_state.pit_timer.pending);

 	return 0;
 }
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
spin_lock(&ps->inject_lock);
if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
ps->irq_ack = 1;
spin_unlock(&ps->inject_lock);
}
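kvm_pit_ack_irq() runs from the new irq ack notifier chain: the PIT only rearms injection once the previous tick has been acknowledged by the guest, instead of the old jiffies-based timeout. The registration side for the PIT appears further down in kvm_create_pit(); the general pattern, with hypothetical names, looks like:

    /* Sketch with a made-up device "foo"; only the notifier API is real. */
    struct foo_device {
            struct kvm_irq_ack_notifier ack;
    };

    static void foo_ack_irq(struct kvm_irq_ack_notifier *kian)
    {
            struct foo_device *foo = container_of(kian, struct foo_device, ack);

            /* The guest acked the previous interrupt; safe to inject again. */
            (void)foo;
    }

    static void foo_register(struct kvm *kvm, struct foo_device *foo)
    {
            foo->ack.gsi = 0;                    /* GSI this device raises */
            foo->ack.irq_acked = foo_ack_irq;
            kvm_register_irq_ack_notifier(kvm, &foo->ack);
    }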
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{ {
struct kvm_kpit_state *ps; struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
 	hrtimer_cancel(&pt->timer);
 }

-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
+	struct kvm_kpit_timer *pt = &ps->pit_timer;
 	s64 interval;

 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
 	pt->period = (is_period == 0) ? 0 : interval;
 	pt->timer.function = pit_timer_fn;
 	atomic_set(&pt->pending, 0);
+	ps->irq_ack = 1;

 	hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
 		      HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(&ps->pit_timer, val, 0);
+		create_pit_timer(ps, val, 0);
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(&ps->pit_timer, val, 1);
+		create_pit_timer(ps, val, 1);
 		break;
 	default:
 		destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	mutex_unlock(&pit->pit_state.lock);

 	atomic_set(&pit->pit_state.pit_timer.pending, 0);
-	pit->pit_state.inject_pending = 1;
+	pit->pit_state.irq_ack = 1;
 }

 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)

 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
+	spin_lock_init(&pit->pit_state.inject_lock);

 	/* Initialize PIO device */
 	pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->pit = pit;
 	hrtimer_init(&pit_state->pit_timer.timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	pit_state->irq_ack_notifier.gsi = 0;
+	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	mutex_unlock(&pit->pit_state.lock);

 	kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
 	mutex_lock(&kvm->lock);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+	kvm_set_irq(kvm, 0, 1);
+	kvm_set_irq(kvm, 0, 0);
 	mutex_unlock(&kvm->lock);
 }
...@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) ...@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm_kpit_state *ps; struct kvm_kpit_state *ps;
if (vcpu && pit) { if (vcpu && pit) {
int inject = 0;
ps = &pit->pit_state; ps = &pit->pit_state;
/* Try to inject pending interrupts when: /* Try to inject pending interrupts when
* 1. Pending exists * last one has been acked.
* 2. Last interrupt was accepted or waited for too long time*/ */
if (atomic_read(&ps->pit_timer.pending) && spin_lock(&ps->inject_lock);
(ps->inject_pending || if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
(jiffies - ps->last_injected_time ps->irq_ack = 0;
>= KVM_MAX_PIT_INTR_INTERVAL))) { inject = 1;
ps->inject_pending = 0;
__inject_pit_timer_intr(kvm);
ps->last_injected_time = jiffies;
}
}
}
void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
struct kvm_arch *arch = &vcpu->kvm->arch;
struct kvm_kpit_state *ps;
if (vcpu && arch->vpit) {
ps = &arch->vpit->pit_state;
if (atomic_read(&ps->pit_timer.pending) &&
(((arch->vpic->pics[0].imr & 1) == 0 &&
arch->vpic->pics[0].irq_base == vec) ||
(arch->vioapic->redirtbl[0].fields.vector == vec &&
arch->vioapic->redirtbl[0].fields.mask != 1))) {
ps->inject_pending = 1;
atomic_dec(&ps->pit_timer.pending);
ps->channels[0].count_load_time = ktime_get();
} }
spin_unlock(&ps->inject_lock);
if (inject)
__inject_pit_timer_intr(kvm);
} }
} }
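
The old heuristic (reinject when the previous tick was accepted or a timeout expired) is replaced by a strict handshake: pit_timer_fn() bumps pending, kvm_inject_pit_timer_irqs() injects only while irq_ack is set and clears it under inject_lock, and the ack notifier registered above (kvm_pit_ack_irq, whose body is not shown in this hunk) presumably re-arms irq_ack once the guest services the interrupt. A compact user-space model of that handshake; the names and structure below are illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Toy model: pending counts expired timer ticks, irq_ack says the guest
 * has acked the previously injected interrupt. */
struct pit_model {
    int  pending;
    bool irq_ack;
};

static bool try_inject(struct pit_model *p)
{
    if (p->pending > 0 && p->irq_ack) {
        p->irq_ack = false;        /* wait for the next ack before reinjecting */
        return true;               /* caller would raise IRQ0 here */
    }
    return false;
}

static void guest_acked(struct pit_model *p)
{
    if (p->pending > 0)
        p->pending--;              /* one outstanding tick consumed */
    p->irq_ack = true;
}

int main(void)
{
    struct pit_model p = { .pending = 3, .irq_ack = true };
    int injected = 0;

    for (int i = 0; i < 6; i++) {
        if (try_inject(&p)) {
            injected++;
            guest_acked(&p);       /* guest eventually EOIs */
        }
    }
    printf("injected %d of 3 pending ticks\n", injected);
    return 0;
}

Back-to-back injections can no longer pile up faster than the guest acknowledges them, which is why the jiffies-based timeout can be dropped.
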
...@@ -8,7 +8,6 @@ struct kvm_kpit_timer { ...@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
int irq; int irq;
s64 period; /* unit: ns */ s64 period; /* unit: ns */
s64 scheduled; s64 scheduled;
ktime_t last_update;
atomic_t pending; atomic_t pending;
}; };
...@@ -34,8 +33,9 @@ struct kvm_kpit_state { ...@@ -34,8 +33,9 @@ struct kvm_kpit_state {
u32 speaker_data_on; u32 speaker_data_on;
struct mutex lock; struct mutex lock;
struct kvm_pit *pit; struct kvm_pit *pit;
bool inject_pending; /* if inject pending interrupts */ spinlock_t inject_lock;
unsigned long last_injected_time; unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
}; };
struct kvm_pit { struct kvm_pit {
...@@ -54,7 +54,6 @@ struct kvm_pit { ...@@ -54,7 +54,6 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3 #define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
struct kvm_pit *kvm_create_pit(struct kvm *kvm); struct kvm_pit *kvm_create_pit(struct kvm *kvm);
void kvm_free_pit(struct kvm *kvm); void kvm_free_pit(struct kvm *kvm);
......
...@@ -30,6 +30,19 @@ ...@@ -30,6 +30,19 @@
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
s->isr_ack |= (1 << irq);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
}
/* /*
* set irq level. If an edge is detected, then the IRR is set to 1 * set irq level. If an edge is detected, then the IRR is set to 1
*/ */
...@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) ...@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
*/ */
static inline void pic_intack(struct kvm_kpic_state *s, int irq) static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{ {
s->isr |= 1 << irq;
if (s->auto_eoi) { if (s->auto_eoi) {
if (s->rotate_on_auto_eoi) if (s->rotate_on_auto_eoi)
s->priority_add = (irq + 1) & 7; s->priority_add = (irq + 1) & 7;
} else pic_clear_isr(s, irq);
s->isr |= (1 << irq); }
/* /*
* We don't clear a level sensitive interrupt here * We don't clear a level sensitive interrupt here
*/ */
...@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) ...@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
s->irr &= ~(1 << irq); s->irr &= ~(1 << irq);
} }
int kvm_pic_read_irq(struct kvm_pic *s) int kvm_pic_read_irq(struct kvm *kvm)
{ {
int irq, irq2, intno; int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
irq = pic_get_irq(&s->pics[0]); irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) { if (irq >= 0) {
...@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) ...@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
intno = s->pics[0].irq_base + irq; intno = s->pics[0].irq_base + irq;
} }
pic_update_irq(s); pic_update_irq(s);
kvm_notify_acked_irq(kvm, irq);
return intno; return intno;
} }
void kvm_pic_reset(struct kvm_kpic_state *s) void kvm_pic_reset(struct kvm_kpic_state *s)
{ {
int irq, irqbase;
struct kvm *kvm = s->pics_state->irq_request_opaque;
struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
if (s == &s->pics_state->pics[0])
irqbase = 0;
else
irqbase = 8;
for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
if (s->irr & (1 << irq) || s->isr & (1 << irq))
kvm_notify_acked_irq(kvm, irq+irqbase);
}
s->last_irr = 0; s->last_irr = 0;
s->irr = 0; s->irr = 0;
s->imr = 0; s->imr = 0;
s->isr = 0; s->isr = 0;
s->isr_ack = 0xff;
s->priority_add = 0; s->priority_add = 0;
s->irq_base = 0; s->irq_base = 0;
s->read_reg_select = 0; s->read_reg_select = 0;
...@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) ...@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr); priority = get_priority(s, s->isr);
if (priority != 8) { if (priority != 8) {
irq = (priority + s->priority_add) & 7; irq = (priority + s->priority_add) & 7;
s->isr &= ~(1 << irq); pic_clear_isr(s, irq);
if (cmd == 5) if (cmd == 5)
s->priority_add = (irq + 1) & 7; s->priority_add = (irq + 1) & 7;
pic_update_irq(s->pics_state); pic_update_irq(s->pics_state);
...@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) ...@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break; break;
case 3: case 3:
irq = val & 7; irq = val & 7;
s->isr &= ~(1 << irq); pic_clear_isr(s, irq);
pic_update_irq(s->pics_state); pic_update_irq(s->pics_state);
break; break;
case 6: case 6:
...@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) ...@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break; break;
case 7: case 7:
irq = val & 7; irq = val & 7;
s->isr &= ~(1 << irq);
s->priority_add = (irq + 1) & 7; s->priority_add = (irq + 1) & 7;
pic_clear_isr(s, irq);
pic_update_irq(s->pics_state); pic_update_irq(s->pics_state);
break; break;
default: default:
...@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) ...@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
s->pics_state->pics[0].irr &= ~(1 << 2); s->pics_state->pics[0].irr &= ~(1 << 2);
} }
s->irr &= ~(1 << ret); s->irr &= ~(1 << ret);
s->isr &= ~(1 << ret); pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2) if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state); pic_update_irq(s->pics_state);
} else { } else {
...@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) ...@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
{ {
struct kvm *kvm = opaque; struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->vcpus[0]; struct kvm_vcpu *vcpu = kvm->vcpus[0];
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
pic_irqchip(kvm)->output = level; s->output = level;
if (vcpu) if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
kvm_vcpu_kick(vcpu); kvm_vcpu_kick(vcpu);
}
} }
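
The new isr_ack byte records, per pin, whether the interrupt raised earlier has already been serviced: pic_irq_request() kicks the vcpu only while the bit is still set (and clears it), and pic_clear_isr() restores it on EOI, so repeated assertions of the same line stop generating redundant IPIs. A self-contained sketch of that bookkeeping; it is deliberately simplified (for instance, the real code sets ISR in pic_intack(), not when the line is raised):

#include <stdint.h>
#include <stdio.h>

/* Simplified model of the master PIC's isr/isr_ack bytes. */
struct pic_model {
    uint8_t isr;       /* in-service bits */
    uint8_t isr_ack;   /* 1 = previous interrupt on this pin was serviced */
};

static int pic_raise(struct pic_model *s, int irq)
{
    if (!(s->isr_ack & (1 << irq)))
        return 0;                  /* still unserviced: skip the vcpu kick */
    s->isr_ack &= ~(1 << irq);
    s->isr |= 1 << irq;            /* merged here for brevity */
    return 1;                      /* would kvm_vcpu_kick() here */
}

static void pic_eoi(struct pic_model *s, int irq)
{
    s->isr &= ~(1 << irq);         /* mirrors pic_clear_isr() */
    s->isr_ack |= 1 << irq;
}

int main(void)
{
    struct pic_model s = { .isr = 0, .isr_ack = 0xff };

    printf("first raise kicks: %d\n", pic_raise(&s, 0));      /* 1 */
    printf("second raise kicks: %d\n", pic_raise(&s, 0));     /* 0, not acked */
    pic_eoi(&s, 0);
    printf("raise after EOI kicks: %d\n", pic_raise(&s, 0));  /* 1 again */
    return 0;
}
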
struct kvm_pic *kvm_create_pic(struct kvm *kvm) struct kvm_pic *kvm_create_pic(struct kvm *kvm)
......
...@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) ...@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (kvm_apic_accept_pic_intr(v)) { if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm); s = pic_irqchip(v->kvm);
s->output = 0; /* PIC */ s->output = 0; /* PIC */
vector = kvm_pic_read_irq(s); vector = kvm_pic_read_irq(v->kvm);
} }
} }
return vector; return vector;
...@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); ...@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{ {
kvm_apic_timer_intr_post(vcpu, vec); kvm_apic_timer_intr_post(vcpu, vec);
kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */ /* TODO: PIT, RTC etc. */
} }
EXPORT_SYMBOL_GPL(kvm_timer_intr_post); EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
......
...@@ -42,6 +42,7 @@ struct kvm_kpic_state { ...@@ -42,6 +42,7 @@ struct kvm_kpic_state {
u8 irr; /* interrupt request register */ u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */ u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */ u8 isr; /* interrupt service register */
u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */ u8 priority_add; /* highest irq priority */
u8 irq_base; u8 irq_base;
u8 read_reg_select; u8 read_reg_select;
...@@ -63,12 +64,13 @@ struct kvm_pic { ...@@ -63,12 +64,13 @@ struct kvm_pic {
void *irq_request_opaque; void *irq_request_opaque;
int output; /* intr from master PIC */ int output; /* intr from master PIC */
struct kvm_io_device dev; struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
}; };
struct kvm_pic *kvm_create_pic(struct kvm *kvm); struct kvm_pic *kvm_create_pic(struct kvm *kvm);
void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm_pic *s);
void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_update_irq(struct kvm_pic *s);
void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{ {
......
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, reg);
return vcpu->arch.regs[reg];
}
static inline void kvm_register_write(struct kvm_vcpu *vcpu,
enum kvm_reg reg,
unsigned long val)
{
vcpu->arch.regs[reg] = val;
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
return kvm_register_read(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
#endif
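
kvm_cache_regs.h makes guest register access lazy: regs_avail marks which cached entries are valid (a miss calls back into kvm_x86_ops->cache_reg()), while regs_dirty marks entries that must be written back before the next guest entry; the unconditional cache_regs() copy in __report_tpr_access() below becomes a single kvm_rip_read(). A user-space sketch of the same avail/dirty pattern, with a plain array standing in for the hardware-held register state (names here are illustrative):

#include <stdint.h>
#include <stdio.h>

enum reg { REG_RAX, REG_RIP, NR_REGS };

static uint64_t hw_regs[NR_REGS] = { 0x1234, 0xffff0 };  /* "VMCS" stand-in */

struct vcpu_model {
    uint64_t regs[NR_REGS];
    uint32_t regs_avail;   /* bit set: regs[] holds a valid copy */
    uint32_t regs_dirty;   /* bit set: regs[] must be flushed back */
};

static uint64_t reg_read(struct vcpu_model *v, enum reg r)
{
    if (!(v->regs_avail & (1u << r))) {
        v->regs[r] = hw_regs[r];       /* lazy fetch, like ->cache_reg() */
        v->regs_avail |= 1u << r;
    }
    return v->regs[r];
}

static void reg_write(struct vcpu_model *v, enum reg r, uint64_t val)
{
    v->regs[r] = val;
    v->regs_avail |= 1u << r;
    v->regs_dirty |= 1u << r;          /* flushed before the next entry */
}

int main(void)
{
    struct vcpu_model v = { .regs_avail = 0, .regs_dirty = 0 };

    printf("rip = 0x%llx\n", (unsigned long long)reg_read(&v, REG_RIP));
    reg_write(&v, REG_RIP, 0xffff2);
    printf("dirty mask = 0x%x\n", v.regs_dirty);
    return 0;
}
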
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <asm/current.h> #include <asm/current.h>
#include <asm/apicdef.h> #include <asm/apicdef.h>
#include <asm/atomic.h> #include <asm/atomic.h>
#include "kvm_cache_regs.h"
#include "irq.h" #include "irq.h"
#define PRId64 "d" #define PRId64 "d"
...@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, ...@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else } else
apic_clear_vector(vector, apic->regs + APIC_TMR); apic_clear_vector(vector, apic->regs + APIC_TMR);
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
kvm_vcpu_kick(vcpu); kvm_vcpu_kick(vcpu);
else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
result = (orig_irr == 0); result = (orig_irr == 0);
break; break;
...@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, ...@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu); kvm_vcpu_kick(vcpu);
} else { } else {
printk(KERN_DEBUG apic_debug("Ignoring de-assert INIT to vcpu %d\n",
"Ignoring de-assert INIT to vcpu %d\n",
vcpu->vcpu_id); vcpu->vcpu_id);
} }
break; break;
case APIC_DM_STARTUP: case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", apic_debug("SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector); vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector; vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq)) kvm_vcpu_kick(vcpu);
wake_up_interruptible(&vcpu->wq);
} }
break; break;
...@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, ...@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
static void apic_set_eoi(struct kvm_lapic *apic) static void apic_set_eoi(struct kvm_lapic *apic)
{ {
int vector = apic_find_highest_isr(apic); int vector = apic_find_highest_isr(apic);
int trigger_mode;
/* /*
* Not every write EOI will has corresponding ISR, * Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC * one example is when Kernel check timer on setup_IO_APIC
...@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) ...@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
apic_update_ppr(apic); apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
} }
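
apic_set_eoi() now derives the trigger mode from the TMR bit of the vector being completed and always forwards the EOI to kvm_ioapic_update_eoi(), instead of calling it only for level-triggered vectors. A small sketch of that TMR test; the 256-bit register is modelled as eight packed 32-bit words, ignoring the 16-byte stride the real LAPIC register mapping uses:

#include <stdint.h>
#include <stdio.h>

#define IOAPIC_EDGE_TRIG  0
#define IOAPIC_LEVEL_TRIG 1

/* Test and clear a vector's bit in a 256-bit TMR stored as 8 x u32. */
static int tmr_test_and_clear(uint32_t tmr[8], int vector)
{
    uint32_t *word = &tmr[(vector >> 5) & 7];
    uint32_t mask = 1u << (vector & 31);
    int was_set = !!(*word & mask);

    *word &= ~mask;
    return was_set;
}

int main(void)
{
    uint32_t tmr[8] = { 0 };
    int vector = 0x31;

    tmr[vector >> 5] |= 1u << (vector & 31);   /* mark as level-triggered */

    int trigger_mode = tmr_test_and_clear(tmr, vector)
                       ? IOAPIC_LEVEL_TRIG : IOAPIC_EDGE_TRIG;
    printf("vector 0x%x -> %s-triggered EOI\n", vector,
           trigger_mode == IOAPIC_LEVEL_TRIG ? "level" : "edge");
    return 0;
}
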
static void apic_send_ipi(struct kvm_lapic *apic) static void apic_send_ipi(struct kvm_lapic *apic)
...@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) ...@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_run *run = vcpu->run; struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
kvm_x86_ops->cache_regs(vcpu); run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.rip = vcpu->arch.rip;
run->tpr_access.is_write = write; run->tpr_access.is_write = write;
} }
...@@ -683,8 +677,8 @@ static void apic_mmio_write(struct kvm_io_device *this, ...@@ -683,8 +677,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
* Refer SDM 8.4.1 * Refer SDM 8.4.1
*/ */
if (len != 4 || alignment) { if (len != 4 || alignment) {
if (printk_ratelimit()) /* Don't shout loud, $infamous_os would cause only noise. */
printk(KERN_ERR "apic write: bad size=%d %lx\n", apic_debug("apic write: bad size=%d %lx\n",
len, (long)address); len, (long)address);
return; return;
} }
...@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) ...@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
if(!atomic_inc_and_test(&apic->timer.pending)) if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) { if (waitqueue_active(q))
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q); wake_up_interruptible(q);
}
if (apic_lvtt_period(apic)) { if (apic_lvtt_period(apic)) {
result = 1; result = 1;
apic->timer.dev.expires = ktime_add_ns( apic->timer.dev.expires = ktime_add_ns(
......
...@@ -70,6 +70,9 @@ static int dbg = 0; ...@@ -70,6 +70,9 @@ static int dbg = 0;
module_param(dbg, bool, 0644); module_param(dbg, bool, 0644);
#endif #endif
static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);
#ifndef MMU_DEBUG #ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0) #define ASSERT(x) do { } while (0)
#else #else
...@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644); ...@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK #define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
struct kvm_pv_mmu_op_buffer { #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
void *ptr;
unsigned len;
unsigned processed;
char buf[512] __aligned(sizeof(long));
};
struct kvm_rmap_desc { struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT]; u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more; struct kvm_rmap_desc *more;
}; };
struct kvm_shadow_walk {
int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
u64 addr, u64 *spte, int level);
};
struct kvm_unsync_walk {
int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
};
typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache; static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache; static struct kmem_cache *mmu_page_header_cache;
...@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) ...@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
unsigned long addr; unsigned long addr;
int ret = 0;
addr = gfn_to_hva(kvm, gfn); addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) if (kvm_is_error_hva(addr))
return 0; return ret;
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr); vma = find_vma(current->mm, addr);
if (vma && is_vm_hugetlb_page(vma)) if (vma && is_vm_hugetlb_page(vma))
return 1; ret = 1;
up_read(&current->mm->mmap_sem);
return 0; return ret;
} }
static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
...@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) ...@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected) if (write_protected)
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
account_shadowed(kvm, gfn);
} }
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
...@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, ...@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG(); BUG();
} }
static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
struct kvm_mmu_page *parent_sp;
int i;
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
fn(vcpu, parent_sp);
mmu_parent_walk(vcpu, parent_sp, fn);
return;
}
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
if (!pte_chain->parent_ptes[i])
break;
parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
fn(vcpu, parent_sp);
mmu_parent_walk(vcpu, parent_sp, fn);
}
}
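
mmu_parent_walk() climbs from a shadow page to every page that maps it: through the single parent_pte pointer while the page has one parent, or through the pte_chain list once it is multimapped, invoking fn() at each ancestor before recursing further up. A simplified user-space sketch of that upward recursion over a plain parent array (no hlist or pte_chain; only the shape of the walk is kept):

#include <stdio.h>

#define MAX_PARENTS 4

/* Toy shadow page: each page records which pages point at it. */
struct sp_model {
    const char *name;
    struct sp_model *parents[MAX_PARENTS];
    int nr_parents;
};

typedef void (*parent_walk_fn)(struct sp_model *sp);

/* Visit every ancestor of sp, nearest first, like mmu_parent_walk(). */
static void parent_walk(struct sp_model *sp, parent_walk_fn fn)
{
    for (int i = 0; i < sp->nr_parents; i++) {
        fn(sp->parents[i]);
        parent_walk(sp->parents[i], fn);
    }
}

static void mark_unsync(struct sp_model *sp)
{
    printf("mark %s: unsync_children\n", sp->name);
}

int main(void)
{
    struct sp_model root = { .name = "root" };
    struct sp_model pdpt = { .name = "pdpt", .parents = { &root }, .nr_parents = 1 };
    struct sp_model pt   = { .name = "pt",   .parents = { &pdpt }, .nr_parents = 1 };

    parent_walk(&pt, mark_unsync);   /* prints pdpt, then root */
    return 0;
}
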
static void kvm_mmu_update_unsync_bitmap(u64 *spte)
{
unsigned int index;
struct kvm_mmu_page *sp = page_header(__pa(spte));
index = spte - sp->spt;
__set_bit(index, sp->unsync_child_bitmap);
sp->unsync_children = 1;
}
static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
int i;
if (!sp->parent_pte)
return;
if (!sp->multimapped) {
kvm_mmu_update_unsync_bitmap(sp->parent_pte);
return;
}
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
if (!pte_chain->parent_ptes[i])
break;
kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
}
}
static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
sp->unsync_children = 1;
kvm_mmu_update_parents_unsync(sp);
return 1;
}
static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
mmu_parent_walk(vcpu, sp, unsync_walk_fn);
kvm_mmu_update_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp) struct kvm_mmu_page *sp)
{ {
...@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, ...@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
sp->spt[i] = shadow_trap_nonpresent_pte; sp->spt[i] = shadow_trap_nonpresent_pte;
} }
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
return 1;
}
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}
#define for_each_unsync_children(bitmap, idx) \
for (idx = find_first_bit(bitmap, 512); \
idx < 512; \
idx = find_next_bit(bitmap, 512, idx+1))
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_unsync_walk *walker)
{
int i, ret;
if (!sp->unsync_children)
return 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
u64 ent = sp->spt[i];
if (is_shadow_present_pte(ent)) {
struct kvm_mmu_page *child;
child = page_header(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
ret = mmu_unsync_walk(child, walker);
if (ret)
return ret;
__clear_bit(i, sp->unsync_child_bitmap);
}
if (child->unsync) {
ret = walker->entry(child, walker);
__clear_bit(i, sp->unsync_child_bitmap);
if (ret)
return ret;
}
}
}
if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
sp->unsync_children = 0;
return 0;
}
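
mmu_unsync_walk() uses the 512-bit unsync_child_bitmap to jump straight to the slots that lead to out-of-sync pages: carrier children are recursed into, unsync children are handed to the walker, visited bits are cleared, and unsync_children is dropped once the bitmap empties. A standalone sketch of that bitmap-driven visit, with a plain uint64_t[8] replacing the kernel bitops:

#include <stdint.h>
#include <stdio.h>

#define ENTRIES 512

/* Linear stand-in for find_first_bit/find_next_bit on a 512-bit map. */
static int next_set_bit(const uint64_t bm[8], int from)
{
    for (int i = from; i < ENTRIES; i++)
        if (bm[i >> 6] & (1ull << (i & 63)))
            return i;
    return ENTRIES;
}

int main(void)
{
    uint64_t unsync_child_bitmap[8] = { 0 };
    int visited = 0;

    /* Pretend entries 3, 70 and 511 lead to unsync children. */
    unsync_child_bitmap[0] |= 1ull << 3;
    unsync_child_bitmap[1] |= 1ull << (70 & 63);
    unsync_child_bitmap[7] |= 1ull << (511 & 63);

    for (int i = next_set_bit(unsync_child_bitmap, 0); i < ENTRIES;
         i = next_set_bit(unsync_child_bitmap, i + 1)) {
        printf("visit child at index %d\n", i);
        unsync_child_bitmap[i >> 6] &= ~(1ull << (i & 63));
        visited++;
    }

    printf("visited %d children, bitmap empty: %d\n", visited,
           next_set_bit(unsync_child_bitmap, 0) == ENTRIES);
    return 0;
}
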
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{ {
unsigned index; unsigned index;
...@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) ...@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL; return NULL;
} }
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
if (sp->role.glevels != vcpu->arch.mmu.root_level) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
rmap_write_protect(vcpu->kvm, sp->gfn);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
kvm_mmu_flush_tlb(vcpu);
kvm_unlink_unsync_page(vcpu->kvm, sp);
return 0;
}
struct sync_walker {
struct kvm_vcpu *vcpu;
struct kvm_unsync_walk walker;
};
static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
walker);
struct kvm_vcpu *vcpu = sync_walk->vcpu;
kvm_sync_page(vcpu, sp);
return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
}
static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
struct sync_walker walker = {
.walker = { .entry = mmu_sync_fn, },
.vcpu = vcpu,
};
while (mmu_unsync_walk(sp, &walker.walker))
cond_resched_lock(&vcpu->kvm->mmu_lock);
}
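
sync_walker embeds the generic kvm_unsync_walk callback inside a private struct, and mmu_sync_fn() recovers the surrounding state with container_of(), so the walker interface never has to grow extra parameters; zap_walker below reuses the same trick. A tiny self-contained illustration of the pattern (all names here are made up):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct walk {                 /* generic callback, like kvm_unsync_walk */
    int (*entry)(struct walk *w);
};

struct counting_walker {      /* private state wrapped around it */
    int count;
    struct walk walker;
};

static int count_fn(struct walk *w)
{
    struct counting_walker *cw = container_of(w, struct counting_walker, walker);

    cw->count++;              /* private field reached through container_of */
    return 0;
}

int main(void)
{
    struct counting_walker cw = {
        .count = 0,
        .walker = { .entry = count_fn },
    };

    for (int i = 0; i < 3; i++)
        cw.walker.entry(&cw.walker);

    printf("callback ran %d times\n", cw.count);   /* 3 */
    return 0;
}
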
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn, gfn_t gfn,
gva_t gaddr, gva_t gaddr,
...@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant; unsigned quadrant;
struct hlist_head *bucket; struct hlist_head *bucket;
struct kvm_mmu_page *sp; struct kvm_mmu_page *sp;
struct hlist_node *node; struct hlist_node *node, *tmp;
role.word = 0; role.word = 0;
role.glevels = vcpu->arch.mmu.root_level; role.glevels = vcpu->arch.mmu.root_level;
...@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word); gfn, role.word);
index = kvm_page_table_hashfn(gfn); index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index]; bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link) hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) { if (sp->gfn == gfn) {
if (sp->unsync)
if (kvm_sync_page(vcpu, sp))
continue;
if (sp->role.word != role.word)
continue;
mmu_page_add_parent_pte(vcpu, sp, parent_pte); mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) {
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
kvm_mmu_mark_parents_unsync(vcpu, sp);
}
pgprintk("%s: found\n", __func__); pgprintk("%s: found\n", __func__);
return sp; return sp;
} }
...@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn; sp->gfn = gfn;
sp->role = role; sp->role = role;
hlist_add_head(&sp->hash_link, bucket); hlist_add_head(&sp->hash_link, bucket);
if (!metaphysical) if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn); rmap_write_protect(vcpu->kvm, gfn);
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp); vcpu->arch.mmu.prefetch_page(vcpu, sp);
else else
...@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp; return sp;
} }
static int walk_shadow(struct kvm_shadow_walk *walker,
struct kvm_vcpu *vcpu, u64 addr)
{
hpa_t shadow_addr;
int level;
int r;
u64 *sptep;
unsigned index;
shadow_addr = vcpu->arch.mmu.root_hpa;
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
while (level >= PT_PAGE_TABLE_LEVEL) {
index = SHADOW_PT_INDEX(addr, level);
sptep = ((u64 *)__va(shadow_addr)) + index;
r = walker->entry(walker, vcpu, addr, sptep, level);
if (r)
return r;
shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
--level;
}
return 0;
}
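
walk_shadow() factors the top-down descent out of the page fault paths: starting from root_hpa (or the PAE root picked by address bits 31:30), it takes a 9-bit index per level with SHADOW_PT_INDEX() and hands each spte to the walker until the callback returns non-zero. A short sketch of the per-level index arithmetic for a 4-level table, which is essentially what PT64_INDEX() expands to:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define LEVEL_BITS 9
#define LEVEL_MASK ((1u << LEVEL_BITS) - 1)

/* 9-bit table index of addr at a given level (1 = page table, 4 = PML4). */
static unsigned pt64_index(uint64_t addr, int level)
{
    return (addr >> (PAGE_SHIFT + (level - 1) * LEVEL_BITS)) & LEVEL_MASK;
}

int main(void)
{
    uint64_t addr = 0x00007f1234567000ull;

    /* Walk from the root down to the leaf, printing the slot touched at
     * each level, in the same order walk_shadow() visits sptes. */
    for (int level = 4; level >= 1; level--)
        printf("level %d -> index %u\n", level, pt64_index(addr, level));
    return 0;
}
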
static void kvm_mmu_page_unlink_children(struct kvm *kvm, static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp) struct kvm_mmu_page *sp)
{ {
...@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, ...@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
rmap_remove(kvm, &pt[i]); rmap_remove(kvm, &pt[i]);
pt[i] = shadow_trap_nonpresent_pte; pt[i] = shadow_trap_nonpresent_pte;
} }
kvm_flush_remote_tlbs(kvm);
return; return;
} }
...@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, ...@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} }
pt[i] = shadow_trap_nonpresent_pte; pt[i] = shadow_trap_nonpresent_pte;
} }
kvm_flush_remote_tlbs(kvm);
} }
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
...@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) ...@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
kvm->vcpus[i]->arch.last_pte_updated = NULL; kvm->vcpus[i]->arch.last_pte_updated = NULL;
} }
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{ {
u64 *parent_pte; u64 *parent_pte;
++kvm->stat.mmu_shadow_zapped;
while (sp->multimapped || sp->parent_pte) { while (sp->multimapped || sp->parent_pte) {
if (!sp->multimapped) if (!sp->multimapped)
parent_pte = sp->parent_pte; parent_pte = sp->parent_pte;
...@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) ...@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_put_page(sp, parent_pte); kvm_mmu_put_page(sp, parent_pte);
set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
} }
}
struct zap_walker {
struct kvm_unsync_walk walker;
struct kvm *kvm;
int zapped;
};
static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
walker);
kvm_mmu_zap_page(zap_walk->kvm, sp);
zap_walk->zapped = 1;
return 0;
}
static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct zap_walker walker = {
.walker = { .entry = mmu_zap_fn, },
.kvm = kvm,
.zapped = 0,
};
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
mmu_unsync_walk(sp, &walker.walker);
return walker.zapped;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
int ret;
++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) { kvm_mmu_unlink_parents(kvm, sp);
if (!sp->role.metaphysical && !sp->role.invalid) kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.metaphysical)
unaccount_shadowed(kvm, sp->gfn); unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
hlist_del(&sp->hash_link); hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp); kvm_mmu_free_page(kvm, sp);
} else { } else {
int invalid = sp->role.invalid;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
sp->role.invalid = 1; sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm); kvm_reload_remote_mmus(kvm);
if (!sp->role.metaphysical && !invalid)
unaccount_shadowed(kvm, sp->gfn);
} }
kvm_mmu_reset_last_pte_updated(kvm); kvm_mmu_reset_last_pte_updated(kvm);
return ret;
} }
/* /*
...@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) ...@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
if (sp->gfn == gfn && !sp->role.metaphysical) { if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn, pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word); sp->role.word);
kvm_mmu_zap_page(kvm, sp);
r = 1; r = 1;
if (kvm_mmu_zap_page(kvm, sp))
n = bucket->first;
} }
return r; return r;
} }
...@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) ...@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, &sp->slot_bitmap); __set_bit(slot, &sp->slot_bitmap);
} }
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
{
int i;
u64 *pt = sp->spt;
if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
return;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] == shadow_notrap_nonpresent_pte)
set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
}
}
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{ {
struct page *page; struct page *page;
...@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) ...@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA) if (gpa == UNMAPPED_GVA)
return NULL; return NULL;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
return page; return page;
} }
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, int largepage, gfn_t gfn,
pfn_t pfn, bool speculative)
{ {
u64 spte; unsigned index;
int was_rmapped = 0; struct hlist_head *bucket;
int was_writeble = is_writeble_pte(*shadow_pte); struct kvm_mmu_page *s;
struct hlist_node *node, *n;
pgprintk("%s: spte %llx access %x write_fault %d" index = kvm_page_table_hashfn(sp->gfn);
" user_fault %d gfn %lx\n", bucket = &vcpu->kvm->arch.mmu_page_hash[index];
__func__, *shadow_pte, pt_access, /* don't unsync if pagetable is shadowed with multiple roles */
write_fault, user_fault, gfn); hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
if (s->gfn != sp->gfn || s->role.metaphysical)
continue;
if (s->role.word != sp->role.word)
return 1;
}
kvm_mmu_mark_parents_unsync(vcpu, sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
mmu_convert_notrap(sp);
return 0;
}
if (is_rmap_pte(*shadow_pte)) { static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
/* bool can_unsync)
* If we overwrite a PTE page pointer with a 2MB PMD, unlink {
* the parent of the now unreachable PTE. struct kvm_mmu_page *shadow;
*/
if (largepage && !is_large_pte(*shadow_pte)) {
struct kvm_mmu_page *child;
u64 pte = *shadow_pte;
child = page_header(pte & PT64_BASE_ADDR_MASK); shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
mmu_page_remove_parent_pte(child, shadow_pte); if (shadow) {
} else if (pfn != spte_to_pfn(*shadow_pte)) { if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
pgprintk("hfn old %lx new %lx\n", return 1;
spte_to_pfn(*shadow_pte), pfn); if (shadow->unsync)
rmap_remove(vcpu->kvm, shadow_pte); return 0;
} else { if (can_unsync && oos_shadow)
if (largepage) return kvm_unsync_page(vcpu, shadow);
was_rmapped = is_large_pte(*shadow_pte); return 1;
else
was_rmapped = 1;
}
} }
return 0;
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
/* /*
* We don't set the accessed bit, since we sometimes want to see * We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect * whether the guest actually used the pte (in order to detect
...@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, ...@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
*/ */
spte = shadow_base_present_pte | shadow_dirty_mask; spte = shadow_base_present_pte | shadow_dirty_mask;
if (!speculative) if (!speculative)
pte_access |= PT_ACCESSED_MASK; spte |= shadow_accessed_mask;
if (!dirty) if (!dirty)
pte_access &= ~ACC_WRITE_MASK; pte_access &= ~ACC_WRITE_MASK;
if (pte_access & ACC_EXEC_MASK) if (pte_access & ACC_EXEC_MASK)
...@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, ...@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK) if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) { || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
ret = 1;
spte = shadow_trap_nonpresent_pte;
goto set_pte;
}
spte |= PT_WRITABLE_MASK; spte |= PT_WRITABLE_MASK;
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
if (shadow ||
(largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n", pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn); __func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK; pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) { if (is_writeble_pte(spte))
spte &= ~PT_WRITABLE_MASK; spte &= ~PT_WRITABLE_MASK;
kvm_x86_ops->tlb_flush(vcpu);
}
if (write_fault)
*ptwrite = 1;
} }
} }
if (pte_access & ACC_WRITE_MASK) if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn); mark_page_dirty(vcpu->kvm, gfn);
pgprintk("%s: setting spte %llx\n", __func__, spte); set_pte:
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
(spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
(spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
set_shadow_pte(shadow_pte, spte); set_shadow_pte(shadow_pte, spte);
if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) return ret;
&& (spte & PT_PRESENT_MASK)) }
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, int largepage, gfn_t gfn,
pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
__func__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
if (is_rmap_pte(*shadow_pte)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
if (largepage && !is_large_pte(*shadow_pte)) {
struct kvm_mmu_page *child;
u64 pte = *shadow_pte;
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, shadow_pte);
} else if (pfn != spte_to_pfn(*shadow_pte)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*shadow_pte), pfn);
rmap_remove(vcpu->kvm, shadow_pte);
} else {
if (largepage)
was_rmapped = is_large_pte(*shadow_pte);
else
was_rmapped = 1;
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
is_large_pte(*shadow_pte)? "2MB" : "4kB",
is_present_pte(*shadow_pte)?"RW":"R", gfn,
*shadow_pte, shadow_pte);
if (!was_rmapped && is_large_pte(*shadow_pte))
++vcpu->kvm->stat.lpages; ++vcpu->kvm->stat.lpages;
page_header_update_slot(vcpu->kvm, shadow_pte, gfn); page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
...@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) ...@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{ {
} }
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, struct direct_shadow_walk {
int largepage, gfn_t gfn, pfn_t pfn, struct kvm_shadow_walk walker;
int level) pfn_t pfn;
{ int write;
hpa_t table_addr = vcpu->arch.mmu.root_hpa; int largepage;
int pt_write = 0; int pt_write;
};
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
u64 *table;
ASSERT(VALID_PAGE(table_addr));
table = __va(table_addr);
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
0, write, 1, &pt_write, 0, gfn, pfn, false);
return pt_write;
}
if (largepage && level == 2) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
0, write, 1, &pt_write, 1, gfn, pfn, false);
return pt_write;
}
if (table[index] == shadow_trap_nonpresent_pte) { static int direct_map_entry(struct kvm_shadow_walk *_walk,
struct kvm_mmu_page *new_table; struct kvm_vcpu *vcpu,
u64 addr, u64 *sptep, int level)
{
struct direct_shadow_walk *walk =
container_of(_walk, struct direct_shadow_walk, walker);
struct kvm_mmu_page *sp;
gfn_t pseudo_gfn; gfn_t pseudo_gfn;
gfn_t gfn = addr >> PAGE_SHIFT;
if (level == PT_PAGE_TABLE_LEVEL
|| (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
0, walk->write, 1, &walk->pt_write,
walk->largepage, gfn, walk->pfn, false);
++vcpu->stat.pf_fixed;
return 1;
}
pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) if (*sptep == shadow_trap_nonpresent_pte) {
>> PAGE_SHIFT; pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
v, level - 1, 1, ACC_ALL, sptep);
1, ACC_ALL, &table[index]); if (!sp) {
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n"); pgprintk("nonpaging_map: ENOMEM\n");
kvm_release_pfn_clean(pfn); kvm_release_pfn_clean(walk->pfn);
return -ENOMEM; return -ENOMEM;
} }
set_shadow_pte(&table[index], set_shadow_pte(sptep,
__pa(new_table->spt) __pa(sp->spt)
| PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_PRESENT_MASK | PT_WRITABLE_MASK
| shadow_user_mask | shadow_x_mask); | shadow_user_mask | shadow_x_mask);
} }
table_addr = table[index] & PT64_BASE_ADDR_MASK; return 0;
} }
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int largepage, gfn_t gfn, pfn_t pfn)
{
int r;
struct direct_shadow_walk walker = {
.walker = { .entry = direct_map_entry, },
.pfn = pfn,
.largepage = largepage,
.write = write,
.pt_write = 0,
};
r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
if (r < 0)
return r;
return walker.pt_write;
} }
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
...@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) ...@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn_t pfn; pfn_t pfn;
unsigned long mmu_seq; unsigned long mmu_seq;
down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1); gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1; largepage = 1;
} }
mmu_seq = vcpu->kvm->mmu_notifier_seq; mmu_seq = vcpu->kvm->mmu_notifier_seq;
/* implicit mb(), we'll read before PT lock is unlocked */ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn); pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
/* mmio */ /* mmio */
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
...@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) ...@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq)) if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock; goto out_unlock;
kvm_mmu_free_some_pages(vcpu); kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, largepage, gfn, pfn, r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
PT32E_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock); spin_unlock(&vcpu->kvm->mmu_lock);
...@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ...@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
} }
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
return;
}
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
if (root) {
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
}
}
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{ {
return vaddr; return vaddr;
...@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, ...@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r) if (r)
return r; return r;
down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1); gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1; largepage = 1;
} }
mmu_seq = vcpu->kvm->mmu_notifier_seq; mmu_seq = vcpu->kvm->mmu_notifier_seq;
/* implicit mb(), we'll read before PT lock is unlocked */ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn); pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn); kvm_release_pfn_clean(pfn);
return 1; return 1;
...@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, ...@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock; goto out_unlock;
kvm_mmu_free_some_pages(vcpu); kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock); spin_unlock(&vcpu->kvm->mmu_lock);
return r; return r;
...@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) ...@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa; context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free; context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page; context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->root_level = 0; context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
...@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) ...@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->page_fault = paging64_page_fault; context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa; context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page; context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
context->free = paging_free; context->free = paging_free;
context->root_level = level; context->root_level = level;
context->shadow_root_level = level; context->shadow_root_level = level;
...@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) ...@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa; context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free; context->free = paging_free;
context->prefetch_page = paging32_prefetch_page; context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->root_level = PT32_ROOT_LEVEL; context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
...@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) ...@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = tdp_page_fault; context->page_fault = tdp_page_fault;
context->free = nonpaging_free; context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page; context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
...@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) ...@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock); spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu); kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu); mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock); spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu); kvm_mmu_flush_tlb(vcpu);
...@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ...@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
return; return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
down_read(&current->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1); gfn &= ~(KVM_PAGES_PER_HPAGE-1);
vcpu->arch.update_pte.largepage = 1; vcpu->arch.update_pte.largepage = 1;
} }
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
/* implicit mb(), we'll read before PT lock is unlocked */ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn); pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn); kvm_release_pfn_clean(pfn);
...@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ...@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn); index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index]; bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.metaphysical) if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
continue; continue;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
...@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ...@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/ */
pgprintk("misaligned: gpa %llx bytes %d role %x\n", pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word); gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp); if (kvm_mmu_zap_page(vcpu->kvm, sp))
n = bucket->first;
++vcpu->kvm->stat.mmu_flooded; ++vcpu->kvm->stat.mmu_flooded;
continue; continue;
} }
...@@ -1969,6 +2350,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ...@@ -1969,6 +2350,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
} }
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
spin_lock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.invlpg(vcpu, gva);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_mmu_flush_tlb(vcpu);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
void kvm_enable_tdp(void) void kvm_enable_tdp(void)
{ {
tdp_enabled = true; tdp_enabled = true;
...@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) ...@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{ {
struct kvm_mmu_page *sp; struct kvm_mmu_page *sp;
spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i; int i;
u64 *pt; u64 *pt;
...@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) ...@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (pt[i] & PT_WRITABLE_MASK) if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK; pt[i] &= ~PT_WRITABLE_MASK;
} }
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
} }
void kvm_mmu_zap_all(struct kvm *kvm) void kvm_mmu_zap_all(struct kvm *kvm)
...@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) ...@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock); spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
kvm_mmu_zap_page(kvm, sp); if (kvm_mmu_zap_page(kvm, sp))
node = container_of(kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
...@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, ...@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret) gpa_t addr, unsigned long *ret)
{ {
int r; int r;
struct kvm_pv_mmu_op_buffer buffer; struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
buffer.ptr = buffer.buf; buffer->ptr = buffer->buf;
buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
buffer.processed = 0; buffer->processed = 0;
r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
if (r) if (r)
goto out; goto out;
while (buffer.len) { while (buffer->len) {
r = kvm_pv_mmu_op_one(vcpu, &buffer); r = kvm_pv_mmu_op_one(vcpu, buffer);
if (r < 0) if (r < 0)
goto out; goto out;
if (r == 0) if (r == 0)
...@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, ...@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
r = 1; r = 1;
out: out:
*ret = buffer.processed; *ret = buffer->processed;
return r; return r;
} }
......
...@@ -25,11 +25,11 @@ ...@@ -25,11 +25,11 @@
#if PTTYPE == 64 #if PTTYPE == 64
#define pt_element_t u64 #define pt_element_t u64
#define guest_walker guest_walker64 #define guest_walker guest_walker64
#define shadow_walker shadow_walker64
#define FNAME(name) paging##64_##name #define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS #define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
...@@ -42,11 +42,11 @@ ...@@ -42,11 +42,11 @@
#elif PTTYPE == 32 #elif PTTYPE == 32
#define pt_element_t u32 #define pt_element_t u32
#define guest_walker guest_walker32 #define guest_walker guest_walker32
#define shadow_walker shadow_walker32
#define FNAME(name) paging##32_##name #define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2 #define PT_MAX_FULL_LEVELS 2
...@@ -73,6 +73,17 @@ struct guest_walker { ...@@ -73,6 +73,17 @@ struct guest_walker {
u32 error_code; u32 error_code;
}; };
struct shadow_walker {
struct kvm_shadow_walk walker;
struct guest_walker *guest_walker;
int user_fault;
int write_fault;
int largepage;
int *ptwrite;
pfn_t pfn;
u64 *sptep;
};
static gfn_t gpte_to_gfn(pt_element_t gpte) static gfn_t gpte_to_gfn(pt_element_t gpte)
{ {
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
...@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, ...@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table; pt_element_t *table;
struct page *page; struct page *page;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn); page = gfn_to_page(kvm, table_gfn);
up_read(&current->mm->mmap_sem);
table = kmap_atomic(page, KM_USER0); table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte); ret = CMPXCHG(&table[index], orig_pte, new_pte);
kunmap_atomic(table, KM_USER0); kunmap_atomic(table, KM_USER0);
kvm_release_page_dirty(page); kvm_release_page_dirty(page);
...@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, ...@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
/* /*
* Fetch a shadow pte for a specific level in the paging hierarchy. * Fetch a shadow pte for a specific level in the paging hierarchy.
*/ */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
struct guest_walker *walker, struct kvm_vcpu *vcpu, u64 addr,
int user_fault, int write_fault, int largepage, u64 *sptep, int level)
int *ptwrite, pfn_t pfn)
{ {
hpa_t shadow_addr; struct shadow_walker *sw =
int level; container_of(_sw, struct shadow_walker, walker);
u64 *shadow_ent; struct guest_walker *gw = sw->guest_walker;
unsigned access = walker->pt_access; unsigned access = gw->pt_access;
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
shadow_addr = vcpu->arch.mmu.root_hpa;
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
struct kvm_mmu_page *shadow_page; struct kvm_mmu_page *shadow_page;
u64 shadow_pte; u64 spte;
int metaphysical; int metaphysical;
gfn_t table_gfn; gfn_t table_gfn;
int r;
pt_element_t curr_pte;
shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (level == PT_PAGE_TABLE_LEVEL
if (level == PT_PAGE_TABLE_LEVEL) || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
break; mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
sw->user_fault, sw->write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
false);
sw->sptep = sptep;
return 1;
}
if (largepage && level == PT_DIRECTORY_LEVEL) if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
break; return 0;
if (is_shadow_present_pte(*shadow_ent) if (is_large_pte(*sptep)) {
&& !is_large_pte(*shadow_ent)) { set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; kvm_flush_remote_tlbs(vcpu->kvm);
continue; rmap_remove(vcpu->kvm, sptep);
} }
if (is_large_pte(*shadow_ent)) if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
rmap_remove(vcpu->kvm, shadow_ent);
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1; metaphysical = 1;
if (!is_dirty_pte(walker->ptes[level - 1])) if (!is_dirty_pte(gw->ptes[level - 1]))
access &= ~ACC_WRITE_MASK; access &= ~ACC_WRITE_MASK;
table_gfn = gpte_to_gfn(walker->ptes[level - 1]); table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
} else { } else {
metaphysical = 0; metaphysical = 0;
table_gfn = walker->table_gfn[level - 2]; table_gfn = gw->table_gfn[level - 2];
} }
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
metaphysical, access, metaphysical, access, sptep);
shadow_ent);
if (!metaphysical) { if (!metaphysical) {
int r; r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
pt_element_t curr_pte;
r = kvm_read_guest_atomic(vcpu->kvm,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte)); &curr_pte, sizeof(curr_pte));
if (r || curr_pte != walker->ptes[level - 2]) { if (r || curr_pte != gw->ptes[level - 2]) {
kvm_release_pfn_clean(pfn); kvm_release_pfn_clean(sw->pfn);
return NULL; sw->sptep = NULL;
return 1;
} }
} }
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK; | PT_WRITABLE_MASK | PT_USER_MASK;
set_shadow_pte(shadow_ent, shadow_pte); *sptep = spte;
} return 0;
}
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *guest_walker,
int user_fault, int write_fault, int largepage,
int *ptwrite, pfn_t pfn)
{
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_walk_entry), },
.guest_walker = guest_walker,
.user_fault = user_fault,
.write_fault = write_fault,
.largepage = largepage,
.ptwrite = ptwrite,
.pfn = pfn,
};
if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
return NULL;
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, walk_shadow(&walker.walker, vcpu, addr);
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
ptwrite, largepage, walker->gfn, pfn, false);
return shadow_ent; return walker.sptep;
} }
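The rewrite above (together with the SHADOW_PT_INDEX and shadow_walker additions earlier in this file) converts FNAME(fetch) from an open-coded descent of the shadow page table into the generic walk_shadow() helper plus a per-level callback: the caller embeds a struct kvm_shadow_walk inside its own context structure and recovers that context with container_of() inside the callback. A self-contained sketch of just that embedding pattern, with made-up names and no KVM types, is shown below.

        /* Illustrative sketch only, not kernel code.  It mirrors the pattern used
         * above: a generic walker descends one level at a time and hands each entry
         * to an embedded callback, which returns non-zero to stop the walk. */
        #include <stddef.h>
        #include <stdint.h>
        #include <stdio.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct walk {
                /* returns 1 to terminate the walk, 0 to descend further */
                int (*entry)(struct walk *w, uint64_t addr, int level);
        };

        static void walk_levels(struct walk *w, uint64_t addr, int top_level)
        {
                for (int level = top_level; level >= 1; --level)
                        if (w->entry(w, addr, level))
                                return;
        }

        /* caller-specific state travels alongside the callback, as in shadow_walker */
        struct fetch_walk {
                struct walk walker;
                int write_fault;
                uint64_t result;
        };

        static int fetch_entry(struct walk *w, uint64_t addr, int level)
        {
                struct fetch_walk *fw = container_of(w, struct fetch_walk, walker);

                if (level == 1) {               /* leaf reached: record it and stop */
                        fw->result = addr | (fw->write_fault ? 0x2 : 0x0);
                        return 1;
                }
                return 0;                       /* keep descending */
        }

        int main(void)
        {
                struct fetch_walk fw = {
                        .walker = { .entry = fetch_entry },
                        .write_fault = 1,
                };

                walk_levels(&fw.walker, 0x1000, 4);
                printf("leaf marker: %#llx\n", (unsigned long long)fw.result);
                return 0;
        }

FNAME(shadow_invlpg_entry) further down reuses the same shadow_walker scaffolding and only has to supply a different ->entry callback.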
/* /*
...@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0; return 0;
} }
down_read(&current->mm->mmap_sem);
if (walker.level == PT_DIRECTORY_LEVEL) { if (walker.level == PT_DIRECTORY_LEVEL) {
gfn_t large_gfn; gfn_t large_gfn;
large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
...@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
} }
} }
mmu_seq = vcpu->kvm->mmu_notifier_seq; mmu_seq = vcpu->kvm->mmu_notifier_seq;
/* implicit mb(), we'll read before PT lock is unlocked */ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
/* mmio */ /* mmio */
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
...@@ -453,6 +461,31 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -453,6 +461,31 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0; return 0;
} }
static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
struct kvm_vcpu *vcpu, u64 addr,
u64 *sptep, int level)
{
if (level == PT_PAGE_TABLE_LEVEL) {
if (is_shadow_present_pte(*sptep))
rmap_remove(vcpu->kvm, sptep);
set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
return 1;
}
if (!is_shadow_present_pte(*sptep))
return 1;
return 0;
}
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_invlpg_entry), },
};
walk_shadow(&walker.walker, vcpu, gva);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{ {
struct guest_walker walker; struct guest_walker walker;
...@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, ...@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
} }
} }
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
* - Alias changes zap the entire shadow cache.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
offset = nr_present = 0;
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn = sp->gfns[i];
if (!is_shadow_present_pte(sp->spt[i]))
continue;
pte_gpa = gfn_to_gpa(sp->gfn);
pte_gpa += (i+offset) * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return -EINVAL;
if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
!(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
rmap_remove(vcpu->kvm, &sp->spt[i]);
if (is_present_pte(gpte))
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
set_shadow_pte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
return !nr_present;
}
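FNAME(sync_page) above is the per-paging-format hook for the new out-of-sync shadow code: a shadow page whose guest page table has been written can be left in place instead of being zapped immediately, and its cached sptes are reconciled with the current guest PTEs later. The hook returns non-zero (!nr_present) when nothing in the page is still present, the natural cue for the caller to zap the whole shadow page instead. The block below is a deliberately simplified, standalone model of that reconcile step; it is not the kernel function, and the helper names are invented.

        /* Simplified standalone model of the "resync a stale shadow page" idea
         * implemented by sync_page() above.  Guest PTEs and shadow entries are
         * plain arrays; present/accessed are single bits. */
        #include <stdio.h>

        #define NENT            8
        #define PTE_PRESENT     0x1
        #define PTE_ACCESSED    0x2

        /* returns the number of entries still live after the resync */
        static int resync(const unsigned *guest_pte, unsigned *shadow, int n)
        {
                int live = 0;

                for (int i = 0; i < n; i++) {
                        if (!shadow[i])
                                continue;               /* nothing cached here */
                        if (!(guest_pte[i] & PTE_PRESENT) ||
                            !(guest_pte[i] & PTE_ACCESSED)) {
                                shadow[i] = 0;          /* drop the stale entry */
                                continue;
                        }
                        shadow[i] = guest_pte[i];       /* refresh from the guest */
                        live++;
                }
                return live;
        }

        int main(void)
        {
                unsigned guest[NENT]  = { PTE_PRESENT | PTE_ACCESSED, PTE_PRESENT, 0 };
                unsigned shadow[NENT] = { 1, 1, 1 };

                int live = resync(guest, shadow, NENT);

                /* live == 0 would be the "just zap the page" case in the real code */
                printf("%d live entries after resync\n", live);
                return 0;
        }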
#undef pt_element_t #undef pt_element_t
#undef guest_walker #undef guest_walker
#undef shadow_walker
#undef FNAME #undef FNAME
#undef PT_BASE_ADDR_MASK #undef PT_BASE_ADDR_MASK
#undef PT_INDEX #undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK #undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK #undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS #undef PT_LEVEL_BITS
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "kvm_svm.h" #include "kvm_svm.h"
#include "irq.h" #include "irq.h"
#include "mmu.h" #include "mmu.h"
#include "kvm_cache_regs.h"
#include <linux/module.h> #include <linux/module.h>
#include <linux/kernel.h> #include <linux/kernel.h>
...@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL"); ...@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
#define IOPM_ALLOC_ORDER 2 #define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1 #define MSRPM_ALLOC_ORDER 1
#define DB_VECTOR 1
#define UD_VECTOR 6
#define GP_VECTOR 13
#define DR7_GD_MASK (1 << 13) #define DR7_GD_MASK (1 << 13)
#define DR6_BD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13)
...@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL"); ...@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_LBRV (1 << 1)
#define SVM_DEATURE_SVML (1 << 2) #define SVM_FEATURE_SVML (1 << 2)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
...@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) ...@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
printk(KERN_DEBUG "%s: NOP\n", __func__); printk(KERN_DEBUG "%s: NOP\n", __func__);
return; return;
} }
if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
__func__, __func__, kvm_rip_read(vcpu), svm->next_rip);
svm->vmcb->save.rip,
svm->next_rip);
vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; kvm_rip_write(vcpu, svm->next_rip);
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
vcpu->arch.interrupt_window_open = 1; vcpu->arch.interrupt_window_open = 1;
...@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) ...@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
(1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) | (1ULL << INTERCEPT_HLT) |
(1ULL << INTERCEPT_INVLPG) |
(1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_INVLPGA) |
(1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) | (1ULL << INTERCEPT_MSR_PROT) |
...@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) ...@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->dr7 = 0x400; save->dr7 = 0x400;
save->rflags = 2; save->rflags = 2;
save->rip = 0x0000fff0; save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/* /*
* cr0 val on cpu init should be 0x60000010, we enable cpu * cr0 val on cpu init should be 0x60000010, we enable cpu
...@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) ...@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) { if (npt_enabled) {
/* Setup VMCB for Nested Paging */ /* Setup VMCB for Nested Paging */
control->nested_ctl = 1; control->nested_ctl = 1;
control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
(1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR); control->intercept_exceptions &= ~(1 << PF_VECTOR);
control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
INTERCEPT_CR3_MASK); INTERCEPT_CR3_MASK);
...@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm); init_vmcb(svm);
if (vcpu->vcpu_id != 0) { if (vcpu->vcpu_id != 0) {
svm->vmcb->save.rip = 0; kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
} }
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
return 0; return 0;
} }
...@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) ...@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
rdtscll(vcpu->arch.host_tsc); rdtscll(vcpu->arch.host_tsc);
} }
static void svm_cache_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.rip = svm->vmcb->save.rip;
}
static void svm_decache_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.rip;
}
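These hunks retire the svm_cache_regs()/svm_decache_regs() callbacks: general-purpose state now lives in vcpu->arch.regs[], accessed through the kvm_rip_read()/kvm_rip_write()/kvm_register_read() helpers that come with the new kvm_cache_regs.h include, while vcpu->arch.regs_avail and regs_dirty (initialised to ~0 in svm_vcpu_reset() above) track which cached values are current. On SVM the three VMCB-backed registers (RAX, RSP, RIP) are simply copied into the VMCB right before VMRUN and back out afterwards, as the svm_vcpu_run() hunk below shows; VMX instead fills its cache lazily through the ->cache_reg hook (vmx_cache_reg() later in this diff). The userspace model below folds both ideas together to show the bookkeeping; it is an illustration, not the real header.

        /* Illustrative model only, not the real kvm_cache_regs.h.  It shows the
         * availability/dirty bookkeeping that lets most register reads and writes
         * stay in a software cache, with "hardware" (the VMCB/VMCS stand-in)
         * touched only at well-defined sync points. */
        #include <stdio.h>

        enum reg { REG_RAX, REG_RSP, REG_RIP, NR_REGS };

        struct vcpu_model {
                unsigned long regs[NR_REGS];    /* software cache */
                unsigned long hw[NR_REGS];      /* stand-in for the VMCB fields */
                unsigned long regs_avail;       /* cache entry is up to date */
                unsigned long regs_dirty;       /* cache entry must be written back */
        };

        static unsigned long reg_read(struct vcpu_model *v, enum reg r)
        {
                if (!(v->regs_avail & (1UL << r))) {    /* lazily fill from "hardware" */
                        v->regs[r] = v->hw[r];
                        v->regs_avail |= 1UL << r;
                }
                return v->regs[r];
        }

        static void reg_write(struct vcpu_model *v, enum reg r, unsigned long val)
        {
                v->regs[r] = val;
                v->regs_avail |= 1UL << r;
                v->regs_dirty |= 1UL << r;              /* flush before the next run */
        }

        static void vcpu_run(struct vcpu_model *v)
        {
                for (int r = 0; r < NR_REGS; r++)       /* write back dirty registers */
                        if (v->regs_dirty & (1UL << r))
                                v->hw[r] = v->regs[r];
                v->regs_dirty = 0;

                v->hw[REG_RIP] += 2;                    /* pretend the guest ran */
                v->regs_avail = 0;                      /* hardware state changed */
        }

        int main(void)
        {
                struct vcpu_model v = { .hw = { 1, 2, 0x1000 } };

                reg_write(&v, REG_RIP, reg_read(&v, REG_RIP) + 2);  /* skip an insn */
                vcpu_run(&v);
                printf("rip = %#lx\n", reg_read(&v, REG_RIP));
                return 0;
        }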
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{ {
return to_svm(vcpu)->vmcb->save.rflags; return to_svm(vcpu)->vmcb->save.rflags;
...@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) ...@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (npt_enabled) if (npt_enabled)
svm_flush_tlb(&svm->vcpu); svm_flush_tlb(&svm->vcpu);
if (event_injection) if (!npt_enabled && event_injection)
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
} }
...@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) ...@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{ {
svm->next_rip = svm->vmcb->save.rip + 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
skip_emulated_instruction(&svm->vcpu); skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu);
} }
static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{ {
svm->next_rip = svm->vmcb->save.rip + 3; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu); skip_emulated_instruction(&svm->vcpu);
kvm_emulate_hypercall(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu);
return 1; return 1;
...@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, ...@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{ {
svm->next_rip = svm->vmcb->save.rip + 2; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
kvm_emulate_cpuid(&svm->vcpu); kvm_emulate_cpuid(&svm->vcpu);
return 1; return 1;
} }
static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
static int emulate_on_interception(struct vcpu_svm *svm, static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run) struct kvm_run *kvm_run)
{ {
...@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) ...@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
(u32)(data >> 32), handler); (u32)(data >> 32), handler);
svm->vmcb->save.rax = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
svm->next_rip = svm->vmcb->save.rip + 2; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu); skip_emulated_instruction(&svm->vcpu);
} }
return 1; return 1;
...@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) ...@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{ {
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vmcb->save.rax & -1u) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
handler); handler);
svm->next_rip = svm->vmcb->save.rip + 2; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data)) if (svm_set_msr(&svm->vcpu, ecx, data))
kvm_inject_gp(&svm->vcpu, 0); kvm_inject_gp(&svm->vcpu, 0);
else else
...@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, ...@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = emulate_on_interception, [SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception, [SVM_EXIT_INVLPGA] = invalid_op_interception,
[SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_MSR] = msr_interception,
...@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ...@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control; control = &svm->vmcb->control;
control->int_vector = irq; control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl &= ~V_INTR_PRIO_MASK;
...@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) ...@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
} }
#ifdef CONFIG_X86_64
#define R "r"
#else
#define R "e"
#endif
static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
...@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u16 gs_selector; u16 gs_selector;
u16 ldt_selector; u16 ldt_selector;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
pre_svm_run(svm); pre_svm_run(svm);
sync_lapic_to_cr8(vcpu); sync_lapic_to_cr8(vcpu);
...@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_enable(); local_irq_enable();
asm volatile ( asm volatile (
"push %%"R"bp; \n\t"
"mov %c[rbx](%[svm]), %%"R"bx \n\t"
"mov %c[rcx](%[svm]), %%"R"cx \n\t"
"mov %c[rdx](%[svm]), %%"R"dx \n\t"
"mov %c[rsi](%[svm]), %%"R"si \n\t"
"mov %c[rdi](%[svm]), %%"R"di \n\t"
"mov %c[rbp](%[svm]), %%"R"bp \n\t"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"push %%rbp; \n\t"
#else
"push %%ebp; \n\t"
#endif
#ifdef CONFIG_X86_64
"mov %c[rbx](%[svm]), %%rbx \n\t"
"mov %c[rcx](%[svm]), %%rcx \n\t"
"mov %c[rdx](%[svm]), %%rdx \n\t"
"mov %c[rsi](%[svm]), %%rsi \n\t"
"mov %c[rdi](%[svm]), %%rdi \n\t"
"mov %c[rbp](%[svm]), %%rbp \n\t"
"mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r8](%[svm]), %%r8 \n\t"
"mov %c[r9](%[svm]), %%r9 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t"
"mov %c[r10](%[svm]), %%r10 \n\t" "mov %c[r10](%[svm]), %%r10 \n\t"
...@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%[svm]), %%r13 \n\t" "mov %c[r13](%[svm]), %%r13 \n\t"
"mov %c[r14](%[svm]), %%r14 \n\t" "mov %c[r14](%[svm]), %%r14 \n\t"
"mov %c[r15](%[svm]), %%r15 \n\t" "mov %c[r15](%[svm]), %%r15 \n\t"
#else
"mov %c[rbx](%[svm]), %%ebx \n\t"
"mov %c[rcx](%[svm]), %%ecx \n\t"
"mov %c[rdx](%[svm]), %%edx \n\t"
"mov %c[rsi](%[svm]), %%esi \n\t"
"mov %c[rdi](%[svm]), %%edi \n\t"
"mov %c[rbp](%[svm]), %%ebp \n\t"
#endif #endif
#ifdef CONFIG_X86_64
/* Enter guest mode */
"push %%rax \n\t"
"mov %c[vmcb](%[svm]), %%rax \n\t"
__ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t"
"pop %%rax \n\t"
#else
/* Enter guest mode */ /* Enter guest mode */
"push %%eax \n\t" "push %%"R"ax \n\t"
"mov %c[vmcb](%[svm]), %%eax \n\t" "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
__ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t" __ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t" __ex(SVM_VMSAVE) "\n\t"
"pop %%eax \n\t" "pop %%"R"ax \n\t"
#endif
/* Save guest registers, load host registers */ /* Save guest registers, load host registers */
"mov %%"R"bx, %c[rbx](%[svm]) \n\t"
"mov %%"R"cx, %c[rcx](%[svm]) \n\t"
"mov %%"R"dx, %c[rdx](%[svm]) \n\t"
"mov %%"R"si, %c[rsi](%[svm]) \n\t"
"mov %%"R"di, %c[rdi](%[svm]) \n\t"
"mov %%"R"bp, %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"mov %%rbx, %c[rbx](%[svm]) \n\t"
"mov %%rcx, %c[rcx](%[svm]) \n\t"
"mov %%rdx, %c[rdx](%[svm]) \n\t"
"mov %%rsi, %c[rsi](%[svm]) \n\t"
"mov %%rdi, %c[rdi](%[svm]) \n\t"
"mov %%rbp, %c[rbp](%[svm]) \n\t"
"mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r8, %c[r8](%[svm]) \n\t"
"mov %%r9, %c[r9](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t"
"mov %%r10, %c[r10](%[svm]) \n\t" "mov %%r10, %c[r10](%[svm]) \n\t"
...@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r13, %c[r13](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t"
"pop %%rbp; \n\t"
#else
"mov %%ebx, %c[rbx](%[svm]) \n\t"
"mov %%ecx, %c[rcx](%[svm]) \n\t"
"mov %%edx, %c[rdx](%[svm]) \n\t"
"mov %%esi, %c[rsi](%[svm]) \n\t"
"mov %%edi, %c[rdi](%[svm]) \n\t"
"mov %%ebp, %c[rbp](%[svm]) \n\t"
"pop %%ebp; \n\t"
#endif #endif
"pop %%"R"bp"
: :
: [svm]"a"(svm), : [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
...@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif #endif
: "cc", "memory" : "cc", "memory"
, R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
, "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
#else
, "ebx", "ecx", "edx" , "esi", "edi"
#endif #endif
); );
...@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
load_db_regs(svm->host_db_regs); load_db_regs(svm->host_db_regs);
vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
write_dr6(svm->host_dr6); write_dr6(svm->host_dr6);
write_dr7(svm->host_dr7); write_dr7(svm->host_dr7);
...@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0; svm->next_rip = 0;
} }
#undef R
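The R macro above lets a single copy of the guest-entry asm serve both 32-bit and 64-bit builds: register names are spelled %%"R"ax, %%"R"bp and so on, the preprocessor pastes in the "e" or "r" prefix, and the previously duplicated #ifdef CONFIG_X86_64 blocks collapse into one, leaving only the r8-r15 moves inside an #ifdef. The same trick in a standalone program (using the compiler's __x86_64__ macro where the kernel uses CONFIG_X86_64):

        /* Minimal demonstration of the word-size-agnostic register-prefix trick
         * used above; builds as both 32-bit (-m32) and 64-bit x86. */
        #include <stdio.h>

        #ifdef __x86_64__
        #define R "r"
        #else
        #define R "e"
        #endif

        int main(void)
        {
                unsigned long sp;

                /* reads %esp on 32-bit builds, %rsp on 64-bit ones */
                asm volatile("mov %%" R "sp, %0" : "=r"(sp));

                printf("stack pointer: %#lx\n", sp);
                return 0;
        }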
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
...@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { ...@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt, .set_gdt = svm_set_gdt,
.get_dr = svm_get_dr, .get_dr = svm_get_dr,
.set_dr = svm_set_dr, .set_dr = svm_set_dr,
.cache_regs = svm_cache_regs,
.decache_regs = svm_decache_regs,
.get_rflags = svm_get_rflags, .get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags, .set_rflags = svm_set_rflags,
......
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
#include "kvm_cache_regs.h"
#include "x86.h"
#include <asm/io.h> #include <asm/io.h>
#include <asm/desc.h> #include <asm/desc.h>
...@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); ...@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
static int enable_ept = 1; static int enable_ept = 1;
module_param(enable_ept, bool, 0); module_param(enable_ept, bool, 0);
static int emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, 0);
struct vmcs { struct vmcs {
u32 revision_id; u32 revision_id;
u32 abort; u32 abort;
...@@ -56,6 +61,7 @@ struct vmcs { ...@@ -56,6 +61,7 @@ struct vmcs {
struct vcpu_vmx { struct vcpu_vmx {
struct kvm_vcpu vcpu; struct kvm_vcpu vcpu;
struct list_head local_vcpus_link; struct list_head local_vcpus_link;
unsigned long host_rsp;
int launched; int launched;
u8 fail; u8 fail;
u32 idt_vectoring_info; u32 idt_vectoring_info;
...@@ -83,6 +89,7 @@ struct vcpu_vmx { ...@@ -83,6 +89,7 @@ struct vcpu_vmx {
} irq; } irq;
} rmode; } rmode;
int vpid; int vpid;
bool emulation_required;
}; };
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
...@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) ...@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (!vcpu->fpu_active) if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR; eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled) if (vcpu->guest_debug.enabled)
eb |= 1u << 1; eb |= 1u << DB_VECTOR;
if (vcpu->arch.rmode.active) if (vcpu->arch.rmode.active)
eb = ~0; eb = ~0;
if (vm_need_ept()) if (vm_need_ept())
...@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) ...@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
unsigned long rip; unsigned long rip;
u32 interruptibility; u32 interruptibility;
rip = vmcs_readl(GUEST_RIP); rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs_writel(GUEST_RIP, rip); kvm_rip_write(vcpu, rip);
/* /*
* We emulated an instruction, so temporary interrupt blocking * We emulated an instruction, so temporary interrupt blocking
...@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) ...@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code) bool has_error_code, u32 error_code)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (has_error_code)
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
if (nr == BP_VECTOR)
vmx->rmode.irq.rip++;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_SOFT_INTR
| (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_EXCEPTION nr | INTR_TYPE_EXCEPTION
| (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK); | INTR_INFO_VALID_MASK);
if (has_error_code)
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
} }
static bool vmx_exception_injected(struct kvm_vcpu *vcpu) static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); return false;
return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
} }
/* /*
...@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) ...@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
return ret; return ret;
} }
/* static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
* Sync the rsp and rip registers into the vcpu structure. This allows
* registers to be accessed by indexing vcpu->arch.regs.
*/
static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
{ {
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
vcpu->arch.rip = vmcs_readl(GUEST_RIP); break;
} case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
/* break;
* Syncs rsp and rip back into the vmcs. Should be called after possible default:
* modification. break;
*/ }
static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
{
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
vmcs_writel(GUEST_RIP, vcpu->arch.rip);
} }
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
...@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) ...@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
static int vmx_get_irq(struct kvm_vcpu *vcpu) static int vmx_get_irq(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); if (!vcpu->arch.interrupt.pending)
u32 idtv_info_field;
idtv_info_field = vmx->idt_vectoring_info;
if (idtv_info_field & INTR_INFO_VALID_MASK) {
if (is_external_interrupt(idtv_info_field))
return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
else
printk(KERN_DEBUG "pending exception: not handled yet\n");
}
return -1; return -1;
return vcpu->arch.interrupt.nr;
} }
static __init int cpu_has_kvm_support(void) static __init int cpu_has_kvm_support(void)
...@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) ...@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
u64 msr; u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | return (msr & (FEATURE_CONTROL_LOCKED |
MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) FEATURE_CONTROL_VMXON_ENABLED))
== MSR_IA32_FEATURE_CONTROL_LOCKED; == FEATURE_CONTROL_LOCKED;
/* locked but not enabled */ /* locked but not enabled */
} }
...@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage) ...@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old); rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | if ((old & (FEATURE_CONTROL_LOCKED |
MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) FEATURE_CONTROL_VMXON_ENABLED))
!= (MSR_IA32_FEATURE_CONTROL_LOCKED | != (FEATURE_CONTROL_LOCKED |
MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) FEATURE_CONTROL_VMXON_ENABLED))
/* enable and lock */ /* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
MSR_IA32_FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LOCKED |
MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); FEATURE_CONTROL_VMXON_ENABLED);
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
asm volatile (ASM_VMX_VMXON_RAX asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&phys_addr), "m"(phys_addr) : : "a"(&phys_addr), "m"(phys_addr)
...@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) ...@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING | CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING; CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW | opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
...@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) ...@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif #endif
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses don't need to cause VM Exits when EPT enabled */ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
min &= ~(CPU_BASED_CR3_LOAD_EXITING | min &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING); CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0) &_cpu_based_exec_control) < 0)
return -EIO; return -EIO;
...@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) ...@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
static void enter_pmode(struct kvm_vcpu *vcpu) static void enter_pmode(struct kvm_vcpu *vcpu)
{ {
unsigned long flags; unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
vcpu->arch.rmode.active = 0; vcpu->arch.rmode.active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
...@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) ...@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu); update_exception_bitmap(vcpu);
if (emulate_invalid_guest_state)
return;
fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
...@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) ...@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
static void enter_rmode(struct kvm_vcpu *vcpu) static void enter_rmode(struct kvm_vcpu *vcpu)
{ {
unsigned long flags; unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
vcpu->arch.rmode.active = 1; vcpu->arch.rmode.active = 1;
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
...@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) ...@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu); update_exception_bitmap(vcpu);
if (emulate_invalid_guest_state)
goto continue_rmode;
vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_LIMIT, 0xffff);
vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
...@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) ...@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
continue_rmode:
kvm_mmu_reset_context(vcpu); kvm_mmu_reset_context(vcpu);
init_rmode(vcpu->kvm); init_rmode(vcpu->kvm);
} }
...@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) ...@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
vmcs_writel(GUEST_GDTR_BASE, dt->base); vmcs_writel(GUEST_GDTR_BASE, dt->base);
} }
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment var;
u32 ar;
vmx_get_segment(vcpu, &var, seg);
ar = vmx_segment_access_rights(&var);
if (var.base != (var.selector << 4))
return false;
if (var.limit != 0xffff)
return false;
if (ar != 0xf3)
return false;
return true;
}
static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs;
unsigned int cs_rpl;
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
cs_rpl = cs.selector & SELECTOR_RPL_MASK;
if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
return false;
if (!cs.s)
return false;
if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
if (cs.dpl > cs_rpl)
return false;
} else if (cs.type & AR_TYPE_CODE_MASK) {
if (cs.dpl != cs_rpl)
return false;
}
if (!cs.present)
return false;
/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
return true;
}
static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment ss;
unsigned int ss_rpl;
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
ss_rpl = ss.selector & SELECTOR_RPL_MASK;
if ((ss.type != 3) && (ss.type != 7))
return false;
if (!ss.s)
return false;
if (ss.dpl != ss_rpl) /* DPL != RPL */
return false;
if (!ss.present)
return false;
return true;
}
static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment var;
unsigned int rpl;
vmx_get_segment(vcpu, &var, seg);
rpl = var.selector & SELECTOR_RPL_MASK;
if (!var.s)
return false;
if (!var.present)
return false;
if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
if (var.dpl < rpl) /* DPL < RPL */
return false;
}
/* TODO: Add other members to kvm_segment_field to allow checking for other access
* rights flags
*/
return true;
}
static bool tr_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment tr;
vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
if ((tr.type != 3) && (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
return false;
if (!tr.present)
return false;
return true;
}
static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment ldtr;
vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
if (ldtr.type != 2)
return false;
if (!ldtr.present)
return false;
return true;
}
static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ss;
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
return ((cs.selector & SELECTOR_RPL_MASK) ==
(ss.selector & SELECTOR_RPL_MASK));
}
/*
* Check if guest state is valid. Returns true if valid, false if
* not.
* We assume that registers are always usable
*/
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
/* real mode guest state checks */
if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
return false;
} else {
/* protected mode guest state checks */
if (!cs_ss_rpl_check(vcpu))
return false;
if (!code_segment_valid(vcpu))
return false;
if (!stack_segment_valid(vcpu))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_DS))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_ES))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_FS))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_GS))
return false;
if (!tr_valid(vcpu))
return false;
if (!ldtr_valid(vcpu))
return false;
}
/* TODO:
* - Add checks on RIP
* - Add checks on RFLAGS
*/
return true;
}
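rmode_segment_valid() above encodes the constraints VMX places on vm86-style real-mode segments: base == selector << 4, a 64 KiB limit, and access rights 0xf3 (present, DPL 3, writable data, accessed), which is also why seg_setup() further down now writes 0xf3 instead of 0x93. The standalone check below restates that arithmetic on a plain struct so it is easy to test outside the kernel; the struct and the constants mirror the hunk, everything else is illustrative.

        /* Illustrative only: the real-mode segment constraints checked above,
         * restated on a plain struct so the arithmetic is easy to verify. */
        #include <assert.h>
        #include <stdint.h>

        struct seg {
                uint16_t selector;
                uint32_t base;
                uint32_t limit;
                uint32_t ar;            /* access rights, as read from the VMCS */
        };

        static int rmode_seg_ok(const struct seg *s)
        {
                return s->base == ((uint32_t)s->selector << 4) &&
                       s->limit == 0xffff &&
                       s->ar == 0xf3;   /* present, DPL 3, writable data, accessed */
        }

        int main(void)
        {
                struct seg good = { .selector = 0x1234, .base = 0x12340,
                                    .limit = 0xffff, .ar = 0xf3 };
                struct seg bad  = good;

                bad.base = 0x20000;     /* base no longer equals selector << 4 */

                assert(rmode_seg_ok(&good));
                assert(!rmode_seg_ok(&bad));
                return 0;
        }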
static int init_rmode_tss(struct kvm *kvm) static int init_rmode_tss(struct kvm *kvm)
{ {
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
...@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) ...@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
if (r < 0) if (r < 0)
goto out; goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); r = kvm_write_guest_page(kvm, fn++, &data,
TSS_IOPB_BASE_OFFSET, sizeof(u16));
if (r < 0) if (r < 0)
goto out; goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
...@@ -1789,7 +1994,7 @@ static void seg_setup(int seg) ...@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0); vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0); vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0x93); vmcs_write32(sf->ar_bytes, 0xf3);
} }
static int alloc_apic_access_page(struct kvm *kvm) static int alloc_apic_access_page(struct kvm *kvm)
...@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) ...@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
if (r) if (r)
goto out; goto out;
down_read(&current->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
up_read(&current->mm->mmap_sem);
out: out:
up_write(&kvm->slots_lock); up_write(&kvm->slots_lock);
return r; return r;
...@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) ...@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
if (r) if (r)
goto out; goto out;
down_read(&current->mm->mmap_sem);
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
out: out:
up_write(&kvm->slots_lock); up_write(&kvm->slots_lock);
return r; return r;
...@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ...@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
} }
if (!vm_need_ept()) if (!vm_need_ept())
exec_control |= CPU_BASED_CR3_STORE_EXITING | exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING; CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
if (cpu_has_secondary_exec_ctrls()) { if (cpu_has_secondary_exec_ctrls()) {
...@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr; u64 msr;
int ret; int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
down_read(&vcpu->kvm->slots_lock); down_read(&vcpu->kvm->slots_lock);
if (!init_rmode(vmx->vcpu.kvm)) { if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM; ret = -ENOMEM;
...@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
fx_init(&vmx->vcpu); fx_init(&vmx->vcpu);
seg_setup(VCPU_SREG_CS);
/* /*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
...@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
} }
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES); seg_setup(VCPU_SREG_ES);
...@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, 0x02); vmcs_writel(GUEST_RFLAGS, 0x02);
if (vmx->vcpu.vcpu_id == 0) if (vmx->vcpu.vcpu_id == 0)
vmcs_writel(GUEST_RIP, 0xfff0); kvm_rip_write(vcpu, 0xfff0);
else else
vmcs_writel(GUEST_RIP, 0); kvm_rip_write(vcpu, 0);
vmcs_writel(GUEST_RSP, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_DR7, 0x400);
...@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
ret = 0; ret = 0;
/* HACK: Don't enable emulation on guest boot/reset */
vmx->emulation_required = 0;
out: out:
up_read(&vcpu->kvm->slots_lock); up_read(&vcpu->kvm->slots_lock);
return ret; return ret;
...@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) ...@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
++vcpu->stat.irq_injections;
if (vcpu->arch.rmode.active) { if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true; vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq; vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); vmx->rmode.irq.rip = kvm_rip_read(vcpu);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return; return;
} }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
...@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ...@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{ {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
vcpu->arch.nmi_pending = 0;
} }
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
...@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) ...@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index]) if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary); clear_bit(word_index, &vcpu->arch.irq_summary);
vmx_inject_irq(vcpu, irq); kvm_queue_interrupt(vcpu, irq);
} }
...@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, ...@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
if (vcpu->arch.interrupt_window_open && if (vcpu->arch.interrupt_window_open &&
vcpu->arch.irq_summary && vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
*/
kvm_do_inject_irq(vcpu); kvm_do_inject_irq(vcpu);
if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
if (!vcpu->arch.interrupt_window_open && if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
...@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) ...@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
static int handle_rmode_exception(struct kvm_vcpu *vcpu, static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code) int vec, u32 err_code)
{ {
if (!vcpu->arch.rmode.active)
return 0;
/* /*
* Instruction with address size override prefix opcode 0x67 * Instruction with address size override prefix opcode 0x67
* Cause the #SS fault with 0 error code in VM86 mode. * Cause the #SS fault with 0 error code in VM86 mode.
...@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, ...@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
return 1; return 1;
/*
* Forward all other exceptions that are valid in real mode.
* FIXME: Breaks guest debugging in real mode, needs to be fixed with
* the required debugging infrastructure rework.
*/
switch (vec) {
case DE_VECTOR:
case DB_VECTOR:
case BP_VECTOR:
case OF_VECTOR:
case BR_VECTOR:
case UD_VECTOR:
case DF_VECTOR:
case SS_VECTOR:
case GP_VECTOR:
case MF_VECTOR:
kvm_queue_exception(vcpu, vec);
return 1;
}
return 0; return 0;
} }
...@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} }
error_code = 0; error_code = 0;
rip = vmcs_readl(GUEST_RIP); rip = kvm_rip_read(vcpu);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK) if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) { if (is_page_fault(intr_info)) {
...@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
cr2 = vmcs_readl(EXIT_QUALIFICATION); cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler); (u32)((u64)cr2 >> 32), handler);
if (vect_info & VECTORING_INFO_VALID_MASK) if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
kvm_mmu_unprotect_page_virt(vcpu, cr2); kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code); return kvm_mmu_page_fault(vcpu, cr2, error_code);
} }
...@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15; reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) { switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */ case 0: /* mov to cr */
KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
(u32)((u64)vcpu->arch.regs[reg] >> 32), handler); (u32)kvm_register_read(vcpu, reg),
(u32)((u64)kvm_register_read(vcpu, reg) >> 32),
handler);
switch (cr) { switch (cr) {
case 0: case 0:
vcpu_load_rsp_rip(vcpu); kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
case 3: case 3:
vcpu_load_rsp_rip(vcpu); kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
case 4: case 4:
vcpu_load_rsp_rip(vcpu); kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
case 8: case 8:
vcpu_load_rsp_rip(vcpu); kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm)) if (irqchip_in_kernel(vcpu->kvm))
return 1; return 1;
...@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}; };
break; break;
case 2: /* clts */ case 2: /* clts */
vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu); vmx_fpu_deactivate(vcpu);
vcpu->arch.cr0 &= ~X86_CR0_TS; vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
...@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
case 1: /*mov from cr*/ case 1: /*mov from cr*/
switch (cr) { switch (cr) {
case 3: case 3:
vcpu_load_rsp_rip(vcpu); kvm_register_write(vcpu, reg, vcpu->arch.cr3);
vcpu->arch.regs[reg] = vcpu->arch.cr3;
vcpu_put_rsp_rip(vcpu);
KVMTRACE_3D(CR_READ, vcpu, (u32)cr, KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
(u32)vcpu->arch.regs[reg], (u32)kvm_register_read(vcpu, reg),
(u32)((u64)vcpu->arch.regs[reg] >> 32), (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
handler); handler);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
case 8: case 8:
vcpu_load_rsp_rip(vcpu); kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
KVMTRACE_2D(CR_READ, vcpu, (u32)cr, KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
(u32)vcpu->arch.regs[reg], handler); (u32)kvm_register_read(vcpu, reg), handler);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
} }
...@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION); exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
dr = exit_qualification & 7; dr = exit_qualification & 7;
reg = (exit_qualification >> 8) & 15; reg = (exit_qualification >> 8) & 15;
vcpu_load_rsp_rip(vcpu);
if (exit_qualification & 16) { if (exit_qualification & 16) {
/* mov from dr */ /* mov from dr */
switch (dr) { switch (dr) {
...@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
default: default:
val = 0; val = 0;
} }
vcpu->arch.regs[reg] = val; kvm_register_write(vcpu, reg, val);
KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else { } else {
/* mov to dr */ /* mov to dr */
} }
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
} }
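
The handle_cr()/handle_dr() hunks drop vcpu_load_rsp_rip()/vcpu_put_rsp_rip() and the direct vcpu->arch.regs[] accesses in favour of kvm_register_read()/kvm_register_write(). Those accessors live in kvm_cache_regs.h, which is not part of this diff; the following is only a sketch of their assumed shape (lazy fill through ->cache_reg on read, availability/dirty tracking on write), consistent with the regs_avail/regs_dirty handling visible later in vmx_vcpu_run():

/* sketch -- assumed contents of arch/x86/kvm/kvm_cache_regs.h */
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg)
{
	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, reg); /* e.g. pull RSP/RIP out of the VMCS */

	return vcpu->arch.regs[reg];
}

static inline void kvm_register_write(struct kvm_vcpu *vcpu,
				      enum kvm_reg reg, unsigned long val)
{
	vcpu->arch.regs[reg] = val;
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); /* write back before vmentry */
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}

static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
	return kvm_register_read(vcpu, VCPU_REGS_RIP);
}

static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
	kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}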
...@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1; return 1;
} }
static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
kvm_mmu_invlpg(vcpu, exit_qualification);
skip_emulated_instruction(vcpu);
return 1;
}
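
handle_invlpg() is a new exit handler: for an INVLPG exit the exit qualification carries the linear address being invalidated, which is handed straight to the shadow MMU. Like the other handlers in this file it is reached through the kvm_vmx_exit_handlers[] table extended below; roughly (sketch, the dispatch code itself is outside this hunk):

	/* sketch of the dispatch in kvm_handle_exit(), not part of this hunk */
	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);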
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
...@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1; return 1;
} }
static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int err;
preempt_enable();
local_irq_enable();
while (!guest_state_valid(vcpu)) {
err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
switch (err) {
case EMULATE_DONE:
break;
case EMULATE_DO_MMIO:
kvm_report_emulation_failure(vcpu, "mmio");
/* TODO: Handle MMIO */
return;
default:
kvm_report_emulation_failure(vcpu, "emulation failure");
return;
}
if (signal_pending(current))
break;
if (need_resched())
schedule();
}
local_irq_disable();
preempt_disable();
/* Guest state should be valid now, no more emulation should be needed */
vmx->emulation_required = 0;
}
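
handle_invalid_guest_state() emulates the guest one instruction at a time, with interrupts and preemption re-enabled, until the architectural state becomes acceptable for VM entry again. It is only reached when vmx->emulation_required is set and the opt-in knob is enabled; presumably that knob is a module parameter along these lines (assumption -- the declaration is not in this hunk):

/* assumed declaration elsewhere in vmx.c -- off by default */
static int emulate_invalid_guest_state;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);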
/* /*
* The exit handlers return 1 if the exit was handled fully and guest execution * The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs * may resume. Otherwise they set the kvm_run parameter to indicate what needs
...@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, ...@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
...@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ...@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info; u32 vectoring_info = vmx->idt_vectoring_info;
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
(u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
/* Access CR3 don't cause VMExit in paging mode, so we need /* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */ * to sync with guest real CR3. */
...@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) ...@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
enable_irq_window(vcpu); enable_irq_window(vcpu);
} }
static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_intr_info;
u32 idtv_info_field, intr_info_field, exit_intr_info_field; u32 idt_vectoring_info;
int vector; bool unblock_nmi;
u8 vector;
update_tpr_threshold(vcpu); int type;
bool idtv_info_valid;
intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); u32 error;
exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
idtv_info_field = vmx->idt_vectoring_info;
if (intr_info_field & INTR_INFO_VALID_MASK) {
if (idtv_info_field & INTR_INFO_VALID_MASK) {
/* TODO: fault when IDT_Vectoring */
if (printk_ratelimit())
printk(KERN_ERR "Fault when IDT_Vectoring\n");
}
enable_intr_window(vcpu);
return;
}
if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
== INTR_TYPE_EXT_INTR
&& vcpu->arch.rmode.active) {
u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
vmx_inject_irq(vcpu, vect); exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
enable_intr_window(vcpu); if (cpu_has_virtual_nmis()) {
return; unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
* SDM 3: 25.7.1.2
* Re-set bit "block by NMI" before VM entry if vmexit caused by
* a guest IRET fault.
*/
if (unblock_nmi && vector != DF_VECTOR)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
} }
KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); idt_vectoring_info = vmx->idt_vectoring_info;
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
if (vmx->vcpu.arch.nmi_injected) {
/* /*
* SDM 3: 25.7.1.2 * SDM 3: 25.7.1.2
* Clear bit "block by NMI" before VM entry if a NMI delivery * Clear bit "block by NMI" before VM entry if a NMI delivery
* faulted. * faulted.
*/ */
if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
== INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & else
~GUEST_INTR_STATE_NMI); vmx->vcpu.arch.nmi_injected = false;
}
kvm_clear_exception_queue(&vmx->vcpu);
if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
kvm_queue_exception_e(&vmx->vcpu, vector, error);
} else
kvm_queue_exception(&vmx->vcpu, vector);
vmx->idt_vectoring_info = 0;
}
kvm_clear_interrupt_queue(&vmx->vcpu);
if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
kvm_queue_interrupt(&vmx->vcpu, vector);
vmx->idt_vectoring_info = 0;
}
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field static void vmx_intr_assist(struct kvm_vcpu *vcpu)
& ~INTR_INFO_RESVD_BITS_MASK); {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, update_tpr_threshold(vcpu);
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) if (cpu_has_virtual_nmis()) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
vmcs_read32(IDT_VECTORING_ERROR_CODE)); if (vmx_nmi_enabled(vcpu)) {
vcpu->arch.nmi_pending = false;
vcpu->arch.nmi_injected = true;
} else {
enable_intr_window(vcpu); enable_intr_window(vcpu);
return; return;
} }
if (cpu_has_virtual_nmis()) { }
/* if (vcpu->arch.nmi_injected) {
* SDM 3: 25.7.1.2
* Re-set bit "block by NMI" before VM entry if vmexit caused by
* a guest IRET fault.
*/
if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
(exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
GUEST_INTR_STATE_NMI);
else if (vcpu->arch.nmi_pending) {
if (vmx_nmi_enabled(vcpu))
vmx_inject_nmi(vcpu); vmx_inject_nmi(vcpu);
enable_intr_window(vcpu); enable_intr_window(vcpu);
return; return;
} }
} }
if (!kvm_cpu_has_interrupt(vcpu)) if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
return; if (vmx_irq_enabled(vcpu))
if (vmx_irq_enabled(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
vector = kvm_cpu_get_interrupt(vcpu); else
vmx_inject_irq(vcpu, vector);
kvm_timer_intr_post(vcpu, vector);
} else
enable_irq_window(vcpu); enable_irq_window(vcpu);
}
if (vcpu->arch.interrupt.pending) {
vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
}
} }
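
The rework splits event handling in two: vmx_complete_interrupts() runs right after a vmexit and converts whatever the CPU reported in IDT_VECTORING_INFO / VM_EXIT_INTR_INFO back into software state (exception queue, interrupt queue, nmi_injected), while vmx_intr_assist() runs before the next vmentry and re-injects from that software state instead of from raw VMCS fields. The interrupt-queue helpers it relies on are small inlines in x86.h; a sketch of their assumed shape:

/* assumed helpers from arch/x86/kvm/x86.h (not shown in this diff) */
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
{
	vcpu->arch.interrupt.pending = true;
	vcpu->arch.interrupt.nr = vector;
}

static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
	vcpu->arch.interrupt.pending = false;
}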
/* /*
...@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) ...@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
static void fixup_rmode_irq(struct vcpu_vmx *vmx) static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{ {
vmx->rmode.irq.pending = 0; vmx->rmode.irq.pending = 0;
if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
return; return;
vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
...@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) ...@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector; | vmx->rmode.irq.vector;
} }
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
#else
#define R "e"
#define Q "l"
#endif
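
The R and Q macros let a single inline-asm body serve both 32-bit and 64-bit builds: R selects the register-name prefix, Q the operand-size suffix. Illustration only:

/*
 * "mov %%"R"ax, %c[rax](%0)"  ->  "mov %%rax, ..."  on x86_64
 *                                 "mov %%eax, ..."  on i386
 * "push"Q" (%%"R"sp)"         ->  "pushq (%%rsp)"   on x86_64
 *                                 "pushl (%%esp)"   on i386
 */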
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info; u32 intr_info;
/* Handle invalid guest state instead of entering VMX */
if (vmx->emulation_required && emulate_invalid_guest_state) {
handle_invalid_guest_state(vcpu, kvm_run);
return;
}
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
/* /*
* Loading guest fpu may have cleared host cr0.ts * Loading guest fpu may have cleared host cr0.ts
*/ */
...@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm( asm(
/* Store host registers */ /* Store host registers */
#ifdef CONFIG_X86_64 "push %%"R"dx; push %%"R"bp;"
"push %%rdx; push %%rbp;" "push %%"R"cx \n\t"
"push %%rcx \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
#else "je 1f \n\t"
"push %%edx; push %%ebp;" "mov %%"R"sp, %c[host_rsp](%0) \n\t"
"push %%ecx \n\t"
#endif
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Check if vmlaunch of vmresume is needed */ /* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t" "cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */ /* Load guest registers. Don't clobber flags. */
"mov %c[cr2](%0), %%"R"ax \n\t"
"mov %%"R"ax, %%cr2 \n\t"
"mov %c[rax](%0), %%"R"ax \n\t"
"mov %c[rbx](%0), %%"R"bx \n\t"
"mov %c[rdx](%0), %%"R"dx \n\t"
"mov %c[rsi](%0), %%"R"si \n\t"
"mov %c[rdi](%0), %%"R"di \n\t"
"mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"mov %c[cr2](%0), %%rax \n\t"
"mov %%rax, %%cr2 \n\t"
"mov %c[rax](%0), %%rax \n\t"
"mov %c[rbx](%0), %%rbx \n\t"
"mov %c[rdx](%0), %%rdx \n\t"
"mov %c[rsi](%0), %%rsi \n\t"
"mov %c[rdi](%0), %%rdi \n\t"
"mov %c[rbp](%0), %%rbp \n\t"
"mov %c[r8](%0), %%r8 \n\t" "mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t" "mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t" "mov %c[r10](%0), %%r10 \n\t"
...@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%0), %%r13 \n\t" "mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t" "mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t" "mov %c[r15](%0), %%r15 \n\t"
"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
#else
"mov %c[cr2](%0), %%eax \n\t"
"mov %%eax, %%cr2 \n\t"
"mov %c[rax](%0), %%eax \n\t"
"mov %c[rbx](%0), %%ebx \n\t"
"mov %c[rdx](%0), %%edx \n\t"
"mov %c[rsi](%0), %%esi \n\t"
"mov %c[rdi](%0), %%edi \n\t"
"mov %c[rbp](%0), %%ebp \n\t"
"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif #endif
"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
/* Enter guest mode */ /* Enter guest mode */
"jne .Llaunched \n\t" "jne .Llaunched \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t"
...@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: " ".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */ /* Save guest registers, load host registers, keep flags */
"xchg %0, (%%"R"sp) \n\t"
"mov %%"R"ax, %c[rax](%0) \n\t"
"mov %%"R"bx, %c[rbx](%0) \n\t"
"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
"mov %%"R"dx, %c[rdx](%0) \n\t"
"mov %%"R"si, %c[rsi](%0) \n\t"
"mov %%"R"di, %c[rdi](%0) \n\t"
"mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"xchg %0, (%%rsp) \n\t"
"mov %%rax, %c[rax](%0) \n\t"
"mov %%rbx, %c[rbx](%0) \n\t"
"pushq (%%rsp); popq %c[rcx](%0) \n\t"
"mov %%rdx, %c[rdx](%0) \n\t"
"mov %%rsi, %c[rsi](%0) \n\t"
"mov %%rdi, %c[rdi](%0) \n\t"
"mov %%rbp, %c[rbp](%0) \n\t"
"mov %%r8, %c[r8](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t"
...@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%0) \n\t" "mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t"
"mov %%cr2, %%rax \n\t"
"mov %%rax, %c[cr2](%0) \n\t"
"pop %%rbp; pop %%rbp; pop %%rdx \n\t"
#else
"xchg %0, (%%esp) \n\t"
"mov %%eax, %c[rax](%0) \n\t"
"mov %%ebx, %c[rbx](%0) \n\t"
"pushl (%%esp); popl %c[rcx](%0) \n\t"
"mov %%edx, %c[rdx](%0) \n\t"
"mov %%esi, %c[rsi](%0) \n\t"
"mov %%edi, %c[rdi](%0) \n\t"
"mov %%ebp, %c[rbp](%0) \n\t"
"mov %%cr2, %%eax \n\t"
"mov %%eax, %c[cr2](%0) \n\t"
"pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif #endif
"mov %%cr2, %%"R"ax \n\t"
"mov %%"R"ax, %c[cr2](%0) \n\t"
"pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t" "setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP), : : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)), [launched]"i"(offsetof(struct vcpu_vmx, launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)), [fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
...@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif #endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory" : "cc", "memory"
, R"bx", R"di", R"si"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
, "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "ebx", "edi", "rsi"
#endif #endif
); );
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending) if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx); fixup_rmode_irq(vmx);
...@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
KVMTRACE_0D(NMI, vcpu, handler); KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2"); asm("int $2");
} }
vmx_complete_interrupts(vmx);
} }
#undef R
#undef Q
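
After the vmexit, RSP and RIP are deliberately left out of regs_avail so they are only read from the VMCS when someone actually asks for them, and regs_dirty is cleared because guest state and the register cache now agree. The ->cache_reg callback installed below (.cache_reg = vmx_cache_reg) presumably fills one register on demand, roughly:

/* sketch of the ->cache_reg callback wired up in vmx_x86_ops below */
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	default:
		break;
	}
}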
static void vmx_free_vmcs(struct kvm_vcpu *vcpu) static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
...@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { ...@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt, .set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt, .get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt, .set_gdt = vmx_set_gdt,
.cache_regs = vcpu_load_rsp_rip, .cache_reg = vmx_cache_reg,
.decache_regs = vcpu_put_rsp_rip,
.get_rflags = vmx_get_rflags, .get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags, .set_rflags = vmx_set_rflags,
......
...@@ -331,9 +331,6 @@ enum vmcs_field { ...@@ -331,9 +331,6 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00 #define AR_RESERVD_MASK 0xfffe0f00
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
......
...@@ -4,10 +4,14 @@ ...@@ -4,10 +4,14 @@
* derived from drivers/kvm/kvm_main.c * derived from drivers/kvm/kvm_main.c
* *
* Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
* *
* Authors: * Authors:
* Avi Kivity <avi@qumranet.com> * Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
* *
* This work is licensed under the terms of the GNU GPL, version 2. See * This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory. * the COPYING file in the top-level directory.
...@@ -19,14 +23,18 @@ ...@@ -19,14 +23,18 @@
#include "mmu.h" #include "mmu.h"
#include "i8254.h" #include "i8254.h"
#include "tss.h" #include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include <linux/clocksource.h> #include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h> #include <linux/kvm.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/intel-iommu.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/msr.h> #include <asm/msr.h>
...@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, ...@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries); struct kvm_cpuid_entry2 __user *entries);
struct kvm_x86_ops *kvm_x86_ops; struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
struct kvm_stats_debugfs_item debugfs_entries[] = { struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_fixed", VCPU_STAT(pf_fixed) },
...@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { ...@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "fpu_reload", VCPU_STAT(fpu_reload) }, { "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
...@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { ...@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) }, { "largepages", VM_STAT(lpages) },
{ NULL } { NULL }
}; };
unsigned long segment_base(u16 selector) unsigned long segment_base(u16 selector)
{ {
struct descriptor_table gdt; struct descriptor_table gdt;
...@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); ...@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{ {
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu); kvm_mmu_flush_tlb(vcpu);
return; return;
} }
...@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) ...@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
__func__, data); __func__, data);
break; break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
break;
} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
}
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE: case MSR_IA32_UCODE_WRITE:
break; break;
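
The new MSR_IA32_DEBUGCTLMSR case above accepts a write of zero silently, tolerates the architectural LBR and BTF bits with a one-line warning, and rejects anything else so the caller ends up injecting #GP. Expressed as a predicate (sketch):

/* sketch: only DEBUGCTLMSR_LBR (bit 0) and DEBUGCTLMSR_BTF (bit 1) are
 * tolerated; other bits are vendor specific and therefore refused */
static bool debugctl_write_allowed(u64 data)
{
	return (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) == 0;
}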
...@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) ...@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */ /* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
down_read(&current->mm->mmap_sem);
vcpu->arch.time_page = vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
if (is_error_page(vcpu->arch.time_page)) { if (is_error_page(vcpu->arch.time_page)) {
kvm_release_page_clean(vcpu->arch.time_page); kvm_release_page_clean(vcpu->arch.time_page);
...@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) ...@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+8: case MSR_IA32_MC0_MISC+8:
case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16: case MSR_IA32_MC0_MISC+16:
case MSR_IA32_MC0_MISC+20:
case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
data = 0; data = 0;
break; break;
case MSR_MTRRcap: case MSR_MTRRcap:
...@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) ...@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PV_MMU: case KVM_CAP_PV_MMU:
r = !tdp_enabled; r = !tdp_enabled;
break; break;
case KVM_CAP_IOMMU:
r = intel_iommu_found();
break;
default: default:
r = 0; r = 0;
break; break;
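
KVM_CAP_IOMMU advertises VT-d backed device assignment only when an IOMMU was actually detected. Userspace probes it through the usual capability ioctl on /dev/kvm; a minimal usage sketch (names are placeholders, not part of the patch):

/* userspace sketch, not kernel code: probe for VT-d device assignment */
static int kvm_supports_device_assignment(int kvm_fd)
{
	/* KVM_CHECK_EXTENSION returns > 0 when the capability is present */
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU) > 0;
}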
...@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, ...@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vcpu *vcpu = filp->private_data; struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
int r; int r;
struct kvm_lapic_state *lapic = NULL;
switch (ioctl) { switch (ioctl) {
case KVM_GET_LAPIC: { case KVM_GET_LAPIC: {
struct kvm_lapic_state lapic; lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
memset(&lapic, 0, sizeof lapic); r = -ENOMEM;
r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); if (!lapic)
goto out;
r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r) if (r)
goto out; goto out;
r = -EFAULT; r = -EFAULT;
if (copy_to_user(argp, &lapic, sizeof lapic)) if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out; goto out;
r = 0; r = 0;
break; break;
} }
case KVM_SET_LAPIC: { case KVM_SET_LAPIC: {
struct kvm_lapic_state lapic; lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!lapic)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&lapic, argp, sizeof lapic)) if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out; goto out;
r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r) if (r)
goto out; goto out;
r = 0; r = 0;
...@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, ...@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL; r = -EINVAL;
} }
out: out:
if (lapic)
kfree(lapic);
return r; return r;
} }
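
KVM_GET_LAPIC / KVM_SET_LAPIC previously kept struct kvm_lapic_state on the kernel stack; that structure is essentially a full APIC register page, which is too big for an ioctl frame, hence the switch to kzalloc()/kmalloc() with a single kfree() on the way out. For scale, the assumed uapi layout of that era:

/* assumption: the structure moved off the stack is about 1 KiB */
#define KVM_APIC_REG_SIZE 0x400
struct kvm_lapic_state {
	char regs[KVM_APIC_REG_SIZE];
};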
...@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, ...@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm *kvm = filp->private_data; struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
int r = -EINVAL; int r = -EINVAL;
/*
* This union makes it completely explicit to gcc-3.x
* that these two variables' stack usage should be
* combined, not added together.
*/
union {
struct kvm_pit_state ps;
struct kvm_memory_alias alias;
} u;
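
The union above keeps kvm_arch_vm_ioctl()'s stack frame bounded: gcc-3.x would otherwise reserve separate slots for the PIT state and the memory alias even though only one ioctl case runs per call. The effect, in miniature:

/*
 * illustration only: with two separate locals the frame holds both,
 *
 *	struct kvm_pit_state	 ps;	(three PIT channel states)
 *	struct kvm_memory_alias	 alias;
 *
 * while the union occupies max(sizeof ps, sizeof alias) bytes and the
 * active member is selected per ioctl case (u.ps vs. u.alias below).
 */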
switch (ioctl) { switch (ioctl) {
case KVM_SET_TSS_ADDR: case KVM_SET_TSS_ADDR:
...@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, ...@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES: case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break; break;
case KVM_SET_MEMORY_ALIAS: { case KVM_SET_MEMORY_ALIAS:
struct kvm_memory_alias alias;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&alias, argp, sizeof alias)) if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
goto out; goto out;
r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
if (r) if (r)
goto out; goto out;
break; break;
}
case KVM_CREATE_IRQCHIP: case KVM_CREATE_IRQCHIP:
r = -ENOMEM; r = -ENOMEM;
kvm->arch.vpic = kvm_create_pic(kvm); kvm->arch.vpic = kvm_create_pic(kvm);
...@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, ...@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out; goto out;
if (irqchip_in_kernel(kvm)) { if (irqchip_in_kernel(kvm)) {
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
if (irq_event.irq < 16) kvm_set_irq(kvm, irq_event.irq, irq_event.level);
kvm_pic_set_irq(pic_irqchip(kvm),
irq_event.irq,
irq_event.level);
kvm_ioapic_set_irq(kvm->arch.vioapic,
irq_event.irq,
irq_event.level);
mutex_unlock(&kvm->lock); mutex_unlock(&kvm->lock);
r = 0; r = 0;
} }
...@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, ...@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
} }
case KVM_GET_IRQCHIP: { case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip; struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
r = -EFAULT; r = -ENOMEM;
if (copy_from_user(&chip, argp, sizeof chip)) if (!chip)
goto out; goto out;
r = -EFAULT;
if (copy_from_user(chip, argp, sizeof *chip))
goto get_irqchip_out;
r = -ENXIO; r = -ENXIO;
if (!irqchip_in_kernel(kvm)) if (!irqchip_in_kernel(kvm))
goto out; goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, &chip); r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r) if (r)
goto out; goto get_irqchip_out;
r = -EFAULT; r = -EFAULT;
if (copy_to_user(argp, &chip, sizeof chip)) if (copy_to_user(argp, chip, sizeof *chip))
goto out; goto get_irqchip_out;
r = 0; r = 0;
get_irqchip_out:
kfree(chip);
if (r)
goto out;
break; break;
} }
case KVM_SET_IRQCHIP: { case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip; struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
r = -EFAULT; r = -ENOMEM;
if (copy_from_user(&chip, argp, sizeof chip)) if (!chip)
goto out; goto out;
r = -EFAULT;
if (copy_from_user(chip, argp, sizeof *chip))
goto set_irqchip_out;
r = -ENXIO; r = -ENXIO;
if (!irqchip_in_kernel(kvm)) if (!irqchip_in_kernel(kvm))
goto out; goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, &chip); r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r) if (r)
goto out; goto set_irqchip_out;
r = 0; r = 0;
set_irqchip_out:
kfree(chip);
if (r)
goto out;
break; break;
} }
case KVM_GET_PIT: { case KVM_GET_PIT: {
struct kvm_pit_state ps;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&ps, argp, sizeof ps)) if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
goto out; goto out;
r = -ENXIO; r = -ENXIO;
if (!kvm->arch.vpit) if (!kvm->arch.vpit)
goto out; goto out;
r = kvm_vm_ioctl_get_pit(kvm, &ps); r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
if (r) if (r)
goto out; goto out;
r = -EFAULT; r = -EFAULT;
if (copy_to_user(argp, &ps, sizeof ps)) if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
goto out; goto out;
r = 0; r = 0;
break; break;
} }
case KVM_SET_PIT: { case KVM_SET_PIT: {
struct kvm_pit_state ps;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&ps, argp, sizeof ps)) if (copy_from_user(&u.ps, argp, sizeof u.ps))
goto out; goto out;
r = -ENXIO; r = -ENXIO;
if (!kvm->arch.vpit) if (!kvm->arch.vpit)
goto out; goto out;
r = kvm_vm_ioctl_set_pit(kvm, &ps); r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
if (r) if (r)
goto out; goto out;
r = 0; r = 0;
...@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, ...@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
val = *(u64 *)new; val = *(u64 *)new;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
kaddr = kmap_atomic(page, KM_USER0); kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
...@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) ...@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{ {
kvm_mmu_invlpg(vcpu, address);
return X86EMUL_CONTINUE; return X86EMUL_CONTINUE;
} }
...@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) ...@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{ {
u8 opcodes[4]; u8 opcodes[4];
unsigned long rip = vcpu->arch.rip; unsigned long rip = kvm_rip_read(vcpu);
unsigned long rip_linear; unsigned long rip_linear;
if (!printk_ratelimit()) if (!printk_ratelimit())
...@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { ...@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
.cmpxchg_emulated = emulator_cmpxchg_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated,
}; };
static void cache_all_regs(struct kvm_vcpu *vcpu)
{
kvm_register_read(vcpu, VCPU_REGS_RAX);
kvm_register_read(vcpu, VCPU_REGS_RSP);
kvm_register_read(vcpu, VCPU_REGS_RIP);
vcpu->arch.regs_dirty = ~0;
}
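
cache_all_regs() exists because x86_emulate.c still pokes vcpu->arch.regs[] directly instead of going through kvm_register_write(): the three reads force the lazily cached registers in from the VMCS, and setting regs_dirty to ~0 guarantees that whatever the emulator touched is written back on the next entry (that is what the test_bit(VCPU_REGS_RSP/RIP, ...regs_dirty) checks at the top of vmx_vcpu_run() pick up). A worked example of the bitmask mechanics, with illustrative values:

/*
 *   after vmexit:      regs_avail = ~(RIP|RSP bits), regs_dirty = 0
 *   cache_all_regs():  the reads fault RAX/RSP/RIP into arch.regs[],
 *                      then regs_dirty = ~0
 *   next vmentry:      test_bit(VCPU_REGS_RSP, &regs_dirty) is set, so
 *                      GUEST_RSP/GUEST_RIP are rewritten from the cache
 */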
int emulate_instruction(struct kvm_vcpu *vcpu, int emulate_instruction(struct kvm_vcpu *vcpu,
struct kvm_run *run, struct kvm_run *run,
unsigned long cr2, unsigned long cr2,
...@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ...@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
int r; int r;
struct decode_cache *c; struct decode_cache *c;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2; vcpu->arch.mmio_fault_cr2 = cr2;
kvm_x86_ops->cache_regs(vcpu); /*
* TODO: fix x86_emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses, can save hundreds of cycles
	 * on Intel for instructions that don't read/change RSP, for example.
*/
cache_all_regs(vcpu);
vcpu->mmio_is_write = 0; vcpu->mmio_is_write = 0;
vcpu->arch.pio.string = 0; vcpu->arch.pio.string = 0;
...@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ...@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DO_MMIO; return EMULATE_DO_MMIO;
} }
kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) { if (vcpu->mmio_is_write) {
...@@ -2225,21 +2287,20 @@ int complete_pio(struct kvm_vcpu *vcpu) ...@@ -2225,21 +2287,20 @@ int complete_pio(struct kvm_vcpu *vcpu)
struct kvm_pio_request *io = &vcpu->arch.pio; struct kvm_pio_request *io = &vcpu->arch.pio;
long delta; long delta;
int r; int r;
unsigned long val;
kvm_x86_ops->cache_regs(vcpu);
if (!io->string) { if (!io->string) {
if (io->in) if (io->in) {
memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, val = kvm_register_read(vcpu, VCPU_REGS_RAX);
io->size); memcpy(&val, vcpu->arch.pio_data, io->size);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
}
} else { } else {
if (io->in) { if (io->in) {
r = pio_copy_data(vcpu); r = pio_copy_data(vcpu);
if (r) { if (r)
kvm_x86_ops->cache_regs(vcpu);
return r; return r;
} }
}
delta = 1; delta = 1;
if (io->rep) { if (io->rep) {
...@@ -2248,18 +2309,23 @@ int complete_pio(struct kvm_vcpu *vcpu) ...@@ -2248,18 +2309,23 @@ int complete_pio(struct kvm_vcpu *vcpu)
* The size of the register should really depend on * The size of the register should really depend on
* current address size. * current address size.
*/ */
vcpu->arch.regs[VCPU_REGS_RCX] -= delta; val = kvm_register_read(vcpu, VCPU_REGS_RCX);
val -= delta;
kvm_register_write(vcpu, VCPU_REGS_RCX, val);
} }
if (io->down) if (io->down)
delta = -delta; delta = -delta;
delta *= io->size; delta *= io->size;
if (io->in) if (io->in) {
vcpu->arch.regs[VCPU_REGS_RDI] += delta; val = kvm_register_read(vcpu, VCPU_REGS_RDI);
else val += delta;
vcpu->arch.regs[VCPU_REGS_RSI] += delta; kvm_register_write(vcpu, VCPU_REGS_RDI, val);
} else {
val = kvm_register_read(vcpu, VCPU_REGS_RSI);
val += delta;
kvm_register_write(vcpu, VCPU_REGS_RSI, val);
}
} }
kvm_x86_ops->decache_regs(vcpu);
io->count -= io->cur_count; io->count -= io->cur_count;
io->cur_count = 0; io->cur_count = 0;
...@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port) int size, unsigned port)
{ {
struct kvm_io_device *pio_dev; struct kvm_io_device *pio_dev;
unsigned long val;
vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
...@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
handler); handler);
kvm_x86_ops->cache_regs(vcpu); val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); memcpy(vcpu->arch.pio_data, &val, 4);
kvm_x86_ops->skip_emulated_instruction(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu);
...@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) ...@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
KVMTRACE_0D(HLT, vcpu, handler); KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) { if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED; vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
up_read(&vcpu->kvm->slots_lock);
kvm_vcpu_block(vcpu);
down_read(&vcpu->kvm->slots_lock);
if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
return -EINTR;
return 1; return 1;
} else { } else {
vcpu->run->exit_reason = KVM_EXIT_HLT; vcpu->run->exit_reason = KVM_EXIT_HLT;
...@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ...@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret; unsigned long nr, a0, a1, a2, a3, ret;
int r = 1; int r = 1;
kvm_x86_ops->cache_regs(vcpu); nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
nr = vcpu->arch.regs[VCPU_REGS_RAX]; a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
a0 = vcpu->arch.regs[VCPU_REGS_RBX]; a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a1 = vcpu->arch.regs[VCPU_REGS_RCX]; a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
a2 = vcpu->arch.regs[VCPU_REGS_RDX];
a3 = vcpu->arch.regs[VCPU_REGS_RSI];
KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
...@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ...@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS; ret = -KVM_ENOSYS;
break; break;
} }
vcpu->arch.regs[VCPU_REGS_RAX] = ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
kvm_x86_ops->decache_regs(vcpu);
++vcpu->stat.hypercalls; ++vcpu->stat.hypercalls;
return r; return r;
} }
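
kvm_emulate_hypercall() now reads its arguments through the cached-register accessors, but the ABI itself is unchanged: the hypercall number travels in RAX, up to four arguments in RBX, RCX, RDX and RSI, and the result comes back in RAX. A guest-side illustration of that convention (not part of this patch; vmcall is the VMX instruction, SVM guests would use vmmcall):

/* guest-side sketch of the ABI consumed above */
static inline long kvm_hypercall1_sketch(unsigned int nr, unsigned long p1)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1)
		     : "memory");
	return ret;
}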
...@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) ...@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{ {
char instruction[3]; char instruction[3];
int ret = 0; int ret = 0;
unsigned long rip = kvm_rip_read(vcpu);
/* /*
...@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) ...@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
*/ */
kvm_mmu_zap_all(vcpu->kvm); kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->cache_regs(vcpu);
kvm_x86_ops->patch_hypercall(vcpu, instruction); kvm_x86_ops->patch_hypercall(vcpu, instruction);
if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) if (emulator_write_emulated(rip, instruction, 3, vcpu)
!= X86EMUL_CONTINUE) != X86EMUL_CONTINUE)
ret = -EFAULT; ret = -EFAULT;
...@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) ...@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
u32 function, index; u32 function, index;
struct kvm_cpuid_entry2 *e, *best; struct kvm_cpuid_entry2 *e, *best;
kvm_x86_ops->cache_regs(vcpu); function = kvm_register_read(vcpu, VCPU_REGS_RAX);
function = vcpu->arch.regs[VCPU_REGS_RAX]; index = kvm_register_read(vcpu, VCPU_REGS_RCX);
index = vcpu->arch.regs[VCPU_REGS_RCX]; kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
vcpu->arch.regs[VCPU_REGS_RAX] = 0; kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
vcpu->arch.regs[VCPU_REGS_RBX] = 0; kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
vcpu->arch.regs[VCPU_REGS_RCX] = 0; kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
vcpu->arch.regs[VCPU_REGS_RDX] = 0;
best = NULL; best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
e = &vcpu->arch.cpuid_entries[i]; e = &vcpu->arch.cpuid_entries[i];
...@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) ...@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
best = e; best = e;
} }
if (best) { if (best) {
vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
} }
kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu);
KVMTRACE_5D(CPUID, vcpu, function, KVMTRACE_5D(CPUID, vcpu, function,
(u32)vcpu->arch.regs[VCPU_REGS_RAX], (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
(u32)vcpu->arch.regs[VCPU_REGS_RBX], (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
(u32)vcpu->arch.regs[VCPU_REGS_RCX], (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
(u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
} }
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
...@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) ...@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr) if (!apic || !apic->vapic_addr)
return; return;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
vcpu->arch.apic->vapic_page = page; vcpu->arch.apic->vapic_page = page;
} }
...@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) ...@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock); up_read(&vcpu->kvm->slots_lock);
} }
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
int r; int r;
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
r = kvm_x86_ops->vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
down_read(&vcpu->kvm->slots_lock);
vapic_enter(vcpu);
preempted:
if (vcpu->guest_debug.enabled)
kvm_x86_ops->guest_debug_pre(vcpu);
again:
if (vcpu->requests) if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu); kvm_mmu_unload(vcpu);
...@@ -2829,6 +2866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2829,6 +2866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) { if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu); __kvm_migrate_timers(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu); kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
...@@ -2854,21 +2893,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2854,21 +2893,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_disable(); local_irq_disable();
if (vcpu->requests || need_resched()) { if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable(); local_irq_enable();
preempt_enable(); preempt_enable();
r = 1; r = 1;
goto out; goto out;
} }
if (signal_pending(current)) { if (vcpu->guest_debug.enabled)
local_irq_enable(); kvm_x86_ops->guest_debug_pre(vcpu);
preempt_enable();
r = -EINTR;
kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
goto out;
}
vcpu->guest_mode = 1; vcpu->guest_mode = 1;
/* /*
...@@ -2917,8 +2950,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2917,8 +2950,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
* Profile KVM exit RIPs: * Profile KVM exit RIPs:
*/ */
if (unlikely(prof_on == KVM_PROFILING)) { if (unlikely(prof_on == KVM_PROFILING)) {
kvm_x86_ops->cache_regs(vcpu); unsigned long rip = kvm_rip_read(vcpu);
profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); profile_hit(KVM_PROFILING, (void *)rip);
} }
if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
...@@ -2927,26 +2960,63 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2927,26 +2960,63 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_lapic_sync_from_vapic(vcpu); kvm_lapic_sync_from_vapic(vcpu);
r = kvm_x86_ops->handle_exit(kvm_run, vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
out:
return r;
}
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
r = kvm_x86_ops->vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
down_read(&vcpu->kvm->slots_lock);
vapic_enter(vcpu);
r = 1;
while (r > 0) {
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
r = vcpu_enter_guest(vcpu, kvm_run);
else {
up_read(&vcpu->kvm->slots_lock);
kvm_vcpu_block(vcpu);
down_read(&vcpu->kvm->slots_lock);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
r = -EINTR;
}
if (r > 0) { if (r > 0) {
if (dm_request_for_irq_injection(vcpu, kvm_run)) { if (dm_request_for_irq_injection(vcpu, kvm_run)) {
r = -EINTR; r = -EINTR;
kvm_run->exit_reason = KVM_EXIT_INTR; kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits; ++vcpu->stat.request_irq_exits;
goto out;
} }
if (!need_resched()) if (signal_pending(current)) {
goto again; r = -EINTR;
kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
} }
if (need_resched()) {
out:
up_read(&vcpu->kvm->slots_lock); up_read(&vcpu->kvm->slots_lock);
if (r > 0) {
kvm_resched(vcpu); kvm_resched(vcpu);
down_read(&vcpu->kvm->slots_lock); down_read(&vcpu->kvm->slots_lock);
goto preempted; }
}
} }
up_read(&vcpu->kvm->slots_lock);
post_kvm_run_save(vcpu, kvm_run); post_kvm_run_save(vcpu, kvm_run);
vapic_exit(vcpu); vapic_exit(vcpu);
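
The per-entry work moves into vcpu_enter_guest() and __vcpu_run() becomes a plain loop, which is also what lets kvm_emulate_halt() stop blocking inline: a halted vcpu simply stays in the loop, kvm_vcpu_block() puts it to sleep, and KVM_REQ_UNHALT flips it back to RUNNABLE. Condensed control flow, restated from the hunks above (locking and the request-irq path omitted):

	/* condensed restatement of the loop above, not new behaviour */
	r = 1;
	while (r > 0) {
		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
			r = vcpu_enter_guest(vcpu, kvm_run); /* one entry/exit cycle */
		else
			kvm_vcpu_block(vcpu); /* halted: sleep until an event arrives */
		if (r > 0 && signal_pending(current))
			r = -EINTR; /* bounce out to userspace */
	}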
...@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu); kvm_vcpu_block(vcpu);
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN; r = -EAGAIN;
goto out; goto out;
} }
...@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} }
} }
#endif #endif
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
kvm_x86_ops->cache_regs(vcpu); kvm_register_write(vcpu, VCPU_REGS_RAX,
vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; kvm_run->hypercall.ret);
kvm_x86_ops->decache_regs(vcpu);
}
r = __vcpu_run(vcpu, kvm_run); r = __vcpu_run(vcpu, kvm_run);
...@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) ...@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{ {
vcpu_load(vcpu); vcpu_load(vcpu);
kvm_x86_ops->cache_regs(vcpu); regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
#endif #endif
regs->rip = vcpu->arch.rip; regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_x86_ops->get_rflags(vcpu); regs->rflags = kvm_x86_ops->get_rflags(vcpu);
/* /*
...@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) ...@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{ {
vcpu_load(vcpu); vcpu_load(vcpu);
vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
#endif #endif
vcpu->arch.rip = regs->rip; kvm_rip_write(vcpu, regs->rip);
kvm_x86_ops->set_rflags(vcpu, regs->rflags); kvm_x86_ops->set_rflags(vcpu, regs->rflags);
kvm_x86_ops->decache_regs(vcpu);
vcpu->arch.exception.pending = false; vcpu->arch.exception.pending = false;
...@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, ...@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
return 0; return 0;
} }
static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment segvar = {
.base = selector << 4,
.limit = 0xffff,
.selector = selector,
.type = 3,
.present = 1,
.dpl = 3,
.db = 0,
.s = 1,
.l = 0,
.g = 0,
.avl = 0,
.unusable = 0,
};
kvm_x86_ops->set_segment(vcpu, &segvar, seg);
return 0;
}
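
kvm_load_realmode_segment() applies the real-mode rule directly: with CR0.PE clear there is no descriptor table to consult, so the base is just selector << 4 and the limit is 64 KiB. Arithmetic example:

/*
 * selector = 0x1234  ->  base = 0x12340, limit = 0xffff
 * so a far pointer 0x1234:0x0010 resolves to linear address 0x12350
 */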
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg) int type_bits, int seg)
{ {
struct kvm_segment kvm_seg; struct kvm_segment kvm_seg;
if (!(vcpu->arch.cr0 & X86_CR0_PE))
return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1; return 1;
kvm_seg.type |= type_bits; kvm_seg.type |= type_bits;
...@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, ...@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss) struct tss_segment_32 *tss)
{ {
tss->cr3 = vcpu->arch.cr3; tss->cr3 = vcpu->arch.cr3;
tss->eip = vcpu->arch.rip; tss->eip = kvm_rip_read(vcpu);
tss->eflags = kvm_x86_ops->get_rflags(vcpu); tss->eflags = kvm_x86_ops->get_rflags(vcpu);
tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
...@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, ...@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
{ {
kvm_set_cr3(vcpu, tss->cr3); kvm_set_cr3(vcpu, tss->cr3);
vcpu->arch.rip = tss->eip; kvm_rip_write(vcpu, tss->eip);
kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
return 1; return 1;
...@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, ...@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
static void save_state_to_tss16(struct kvm_vcpu *vcpu, static void save_state_to_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss) struct tss_segment_16 *tss)
{ {
tss->ip = vcpu->arch.rip; tss->ip = kvm_rip_read(vcpu);
tss->flag = kvm_x86_ops->get_rflags(vcpu); tss->flag = kvm_x86_ops->get_rflags(vcpu);
tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
...@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, ...@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
static int load_state_from_tss16(struct kvm_vcpu *vcpu, static int load_state_from_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss) struct tss_segment_16 *tss)
{ {
vcpu->arch.rip = tss->ip; kvm_rip_write(vcpu, tss->ip);
kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
return 1; return 1;
...@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) ...@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
} }
kvm_x86_ops->skip_emulated_instruction(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu);
kvm_x86_ops->cache_regs(vcpu);
if (nseg_desc.type & 8) if (nseg_desc.type & 8)
ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
...@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) ...@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
tr_seg.type = 11; tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out: out:
kvm_x86_ops->decache_regs(vcpu);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(kvm_task_switch); EXPORT_SYMBOL_GPL(kvm_task_switch);
...@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, ...@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
pr_debug("Set back pending irq %d\n", pr_debug("Set back pending irq %d\n",
pending_vec); pending_vec);
} }
kvm_pic_clear_isr_ack(vcpu->kvm);
} }
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
...@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, ...@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
/* Older userspace won't unhalt the vcpu on reset. */
if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!(vcpu->arch.cr0 & X86_CR0_PE))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
vcpu_put(vcpu); vcpu_put(vcpu);
return 0; return 0;
...@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) ...@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
return kvm; return kvm;
} }
...@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) ...@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm)
{ {
kvm_iommu_unmap_guest(kvm);
kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm); kvm_free_pit(kvm);
kfree(kvm->arch.vpic); kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic); kfree(kvm->arch.vioapic);
...@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, ...@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0, userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE, npages * PAGE_SIZE,
PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, MAP_PRIVATE | MAP_ANONYMOUS,
0); 0);
up_write(&current->mm->mmap_sem); up_write(&current->mm->mmap_sem);
......
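The hunk above changes the anonymous mapping created for a newly added memory slot from MAP_SHARED to MAP_PRIVATE (the rationale is not part of this excerpt). For reference, the userspace analogue of the mapping that do_mmap() is asked to build looks like this sketch, using plain mmap(2):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t npages = 16, len = npages * 4096;

	/* Same protection and flags as the do_mmap() call above, from userspace. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);	/* anonymous memory starts out zero-filled anyway */
	printf("mapped %zu bytes at %p\n", len, p);
	munmap(p, len);
	return 0;
}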
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
}
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
{
vcpu->arch.interrupt.pending = true;
vcpu->arch.interrupt.nr = vector;
}
static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.interrupt.pending = false;
}
#endif
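The new arch/x86/kvm/x86.h above keeps at most one queued interrupt per vcpu as a (pending, nr) pair next to the existing exception queue. A trivial standalone mock of that bookkeeping, with a local struct standing in for struct kvm_vcpu (illustrative only):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Mock of the relevant vcpu->arch state: one pending exception, one pending interrupt. */
struct mock_vcpu_arch {
	struct { bool pending; } exception;
	struct { bool pending; uint8_t nr; } interrupt;
};

static void queue_interrupt(struct mock_vcpu_arch *a, uint8_t vector)
{
	a->interrupt.pending = true;
	a->interrupt.nr = vector;
}

static void clear_interrupt_queue(struct mock_vcpu_arch *a)
{
	a->interrupt.pending = false;
}

int main(void)
{
	struct mock_vcpu_arch arch = { 0 };

	queue_interrupt(&arch, 32);	/* vector value chosen for illustration */
	if (arch.interrupt.pending)
		printf("would inject vector %u on next entry\n", arch.interrupt.nr);
	clear_interrupt_queue(&arch);	/* injected (or state reset), nothing pending now */
	return 0;
}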
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#define DPRINTF(_f, _a ...) printf(_f , ## _a) #define DPRINTF(_f, _a ...) printf(_f , ## _a)
#else #else
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
#define DPRINTF(x...) do {} while (0) #define DPRINTF(x...) do {} while (0)
#endif #endif
#include <linux/module.h> #include <linux/module.h>
...@@ -46,25 +47,26 @@ ...@@ -46,25 +47,26 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */ #define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */ #define DstMem (3<<1) /* Memory operand. */
#define DstMask (3<<1) #define DstAcc (4<<1) /* Destination Accumulator */
#define DstMask (7<<1)
/* Source operand type. */ /* Source operand type. */
#define SrcNone (0<<3) /* No source operand. */ #define SrcNone (0<<4) /* No source operand. */
#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
#define SrcReg (1<<3) /* Register operand. */ #define SrcReg (1<<4) /* Register operand. */
#define SrcMem (2<<3) /* Memory operand. */ #define SrcMem (2<<4) /* Memory operand. */
#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). */
#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ #define SrcMem32 (4<<4) /* Memory operand (32-bit). */
#define SrcImm (5<<3) /* Immediate operand. */ #define SrcImm (5<<4) /* Immediate operand. */
#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcMask (7<<3) #define SrcMask (7<<4)
/* Generic ModRM decode. */ /* Generic ModRM decode. */
#define ModRM (1<<6) #define ModRM (1<<7)
/* Destination is only written; never read. */ /* Destination is only written; never read. */
#define Mov (1<<7) #define Mov (1<<8)
#define BitOp (1<<8) #define BitOp (1<<9)
#define MemAbs (1<<9) /* Memory operand is absolute displacement */ #define MemAbs (1<<10) /* Memory operand is absolute displacement */
#define String (1<<10) /* String instruction (rep capable) */ #define String (1<<12) /* String instruction (rep capable) */
#define Stack (1<<11) /* Stack instruction (push/pop) */ #define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */
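The operand-descriptor bits above were repacked to make room for the new DstAcc destination type: the Dst field now spans bits 1-3 (DstMask is 7<<1), the Src field moves up to bits 4-6, and ModRM/Mov/BitOp/MemAbs/String/Stack shift to higher bits. A standalone check of how a combined descriptor decomposes under the new layout (constants copied from the hunk above):

#include <stdio.h>

/* New operand-descriptor layout, values copied from the hunk above. */
#define ImplicitOps (1 << 1)
#define DstReg      (2 << 1)
#define DstMem      (3 << 1)
#define DstAcc      (4 << 1)
#define DstMask     (7 << 1)
#define SrcNone     (0 << 4)
#define SrcReg      (1 << 4)
#define SrcMem      (2 << 4)
#define SrcImm      (5 << 4)
#define SrcImmByte  (6 << 4)
#define SrcMask     (7 << 4)
#define ModRM       (1 << 7)
#define Mov         (1 << 8)

static void decompose(unsigned int d)
{
	printf("flags %#06x: dst type %u, src type %u, ModRM %d, Mov %d\n",
	       d, (d & DstMask) >> 1, (d & SrcMask) >> 4,
	       !!(d & ModRM), !!(d & Mov));
}

int main(void)
{
	decompose(DstAcc | SrcImmByte);		/* new 0x24 entry: and al, imm8   */
	decompose(DstAcc | SrcImm);		/* new 0x25 entry: and (e)ax, imm */
	decompose(DstMem | SrcReg | ModRM);	/* a typical r/m, reg form        */
	return 0;
}

With DstAcc available in the decode table, the dedicated "and al/ax, imm" cases are removed from x86_emulate_insn() further down and folded into the generic 0x20 ... 0x25 range.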
...@@ -94,7 +96,7 @@ static u16 opcode_table[256] = { ...@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
/* 0x20 - 0x27 */ /* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
SrcImmByte, SrcImm, 0, 0, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */ /* 0x28 - 0x2F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
...@@ -106,7 +108,8 @@ static u16 opcode_table[256] = { ...@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
/* 0x38 - 0x3F */ /* 0x38 - 0x3F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
0, 0,
/* 0x40 - 0x47 */ /* 0x40 - 0x47 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x48 - 0x4F */ /* 0x48 - 0x4F */
...@@ -153,9 +156,16 @@ static u16 opcode_table[256] = { ...@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | String, ImplicitOps | String, ByteOp | ImplicitOps | String, ImplicitOps | String,
/* 0xB0 - 0xBF */ /* 0xB0 - 0xB7 */
0, 0, 0, 0, 0, 0, 0, 0, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
/* 0xB8 - 0xBF */
DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
/* 0xC0 - 0xC7 */ /* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, ImplicitOps | Stack, 0, 0, 0, ImplicitOps | Stack, 0, 0,
...@@ -169,17 +179,20 @@ static u16 opcode_table[256] = { ...@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
/* 0xD8 - 0xDF */ /* 0xD8 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */ /* 0xE0 - 0xE7 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xE8 - 0xEF */ /* 0xE8 - 0xEF */
ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps | Stack, SrcImm | ImplicitOps,
ImplicitOps, SrcImmByte | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps,
0, 0, 0, 0, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */ /* 0xF0 - 0xF7 */
0, 0, 0, 0, 0, 0, 0, 0,
ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */ /* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, 0, ImplicitOps, ImplicitOps,
0, 0, Group | Group4, Group | Group5, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
}; };
static u16 twobyte_table[256] = { static u16 twobyte_table[256] = {
...@@ -268,15 +281,16 @@ static u16 group_table[] = { ...@@ -268,15 +281,16 @@ static u16 group_table[] = {
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0, 0, 0, 0, 0,
[Group3*8] = [Group3*8] =
DstMem | SrcImm | ModRM | SrcImm, 0, DstMem | SrcImm | ModRM, 0,
DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
0, 0, 0, 0, 0, 0, 0, 0,
[Group4*8] = [Group4*8] =
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
[Group5*8] = [Group5*8] =
DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] = [Group7*8] =
0, 0, ModRM | SrcMem, ModRM | SrcMem, 0, 0, ModRM | SrcMem, ModRM | SrcMem,
SrcNone | ModRM | DstMem | Mov, 0, SrcNone | ModRM | DstMem | Mov, 0,
...@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* Shadow copy of register state. Committed on successful emulation. */ /* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache)); memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->vcpu->arch.rip; c->eip = kvm_rip_read(ctxt->vcpu);
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
...@@ -1048,6 +1062,23 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1048,6 +1062,23 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
} }
c->dst.type = OP_MEM; c->dst.type = OP_MEM;
break; break;
case DstAcc:
c->dst.type = OP_REG;
c->dst.bytes = c->op_bytes;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
switch (c->op_bytes) {
case 1:
c->dst.val = *(u8 *)c->dst.ptr;
break;
case 2:
c->dst.val = *(u16 *)c->dst.ptr;
break;
case 4:
c->dst.val = *(u32 *)c->dst.ptr;
break;
}
c->dst.orig_val = c->dst.val;
break;
} }
if (c->rip_relative) if (c->rip_relative)
...@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, ...@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
case 1: /* dec */ case 1: /* dec */
emulate_1op("dec", c->dst, ctxt->eflags); emulate_1op("dec", c->dst, ctxt->eflags);
break; break;
case 2: /* call near abs */ {
long int old_eip;
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
emulate_push(ctxt);
break;
}
case 4: /* jmp abs */ case 4: /* jmp abs */
c->eip = c->src.val; c->eip = c->src.val;
break; break;
...@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u64 msr_data; u64 msr_data;
unsigned long saved_eip = 0; unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode; struct decode_cache *c = &ctxt->decode;
unsigned int port;
int io_dir_in;
int rc = 0; int rc = 0;
/* Shadow copy of register state. Committed on successful emulation. /* Shadow copy of register state. Committed on successful emulation.
...@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (c->rep_prefix && (c->d & String)) { if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */ /* All REP prefixes have the same first termination condition */
if (c->regs[VCPU_REGS_RCX] == 0) { if (c->regs[VCPU_REGS_RCX] == 0) {
ctxt->vcpu->arch.rip = c->eip; kvm_rip_write(ctxt->vcpu, c->eip);
goto done; goto done;
} }
/* The second termination condition only applies for REPE /* The second termination condition only applies for REPE
...@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
(c->b == 0xae) || (c->b == 0xaf)) { (c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) && if ((c->rep_prefix == REPE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == 0)) { ((ctxt->eflags & EFLG_ZF) == 0)) {
ctxt->vcpu->arch.rip = c->eip; kvm_rip_write(ctxt->vcpu, c->eip);
goto done; goto done;
} }
if ((c->rep_prefix == REPNE_PREFIX) && if ((c->rep_prefix == REPNE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
ctxt->vcpu->arch.rip = c->eip; kvm_rip_write(ctxt->vcpu, c->eip);
goto done; goto done;
} }
} }
c->regs[VCPU_REGS_RCX]--; c->regs[VCPU_REGS_RCX]--;
c->eip = ctxt->vcpu->arch.rip; c->eip = kvm_rip_read(ctxt->vcpu);
} }
if (c->src.type == OP_MEM) { if (c->src.type == OP_MEM) {
...@@ -1351,27 +1392,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1351,27 +1392,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
sbb: /* sbb */ sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break; break;
case 0x20 ... 0x23: case 0x20 ... 0x25:
and: /* and */ and: /* and */
emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
break; break;
case 0x24: /* and al imm8 */
c->dst.type = OP_REG;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
c->dst.val = *(u8 *)c->dst.ptr;
c->dst.bytes = 1;
c->dst.orig_val = c->dst.val;
goto and;
case 0x25: /* and ax imm16, or eax imm32 */
c->dst.type = OP_REG;
c->dst.bytes = c->op_bytes;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
if (c->op_bytes == 2)
c->dst.val = *(u16 *)c->dst.ptr;
else
c->dst.val = *(u32 *)c->dst.ptr;
c->dst.orig_val = c->dst.val;
goto and;
case 0x28 ... 0x2d: case 0x28 ... 0x2d:
sub: /* sub */ sub: /* sub */
emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
...@@ -1659,7 +1683,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1659,7 +1683,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
case 0xae ... 0xaf: /* scas */ case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n"); DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate; goto cannot_emulate;
case 0xb8: /* mov r, imm */ case 0xb0 ... 0xbf: /* mov r, imm */
goto mov; goto mov;
case 0xc0 ... 0xc1: case 0xc0 ... 0xc1:
emulate_grp2(ctxt); emulate_grp2(ctxt);
...@@ -1679,6 +1703,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1679,6 +1703,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->src.val = c->regs[VCPU_REGS_RCX]; c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt); emulate_grp2(ctxt);
break; break;
case 0xe4: /* inb */
case 0xe5: /* in */
port = insn_fetch(u8, 1, c->eip);
io_dir_in = 1;
goto do_io;
case 0xe6: /* outb */
case 0xe7: /* out */
port = insn_fetch(u8, 1, c->eip);
io_dir_in = 0;
goto do_io;
case 0xe8: /* call (near) */ { case 0xe8: /* call (near) */ {
long int rel; long int rel;
switch (c->op_bytes) { switch (c->op_bytes) {
...@@ -1729,6 +1763,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1729,6 +1763,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
jmp_rel(c, c->src.val); jmp_rel(c, c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */ c->dst.type = OP_NONE; /* Disable writeback. */
break; break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
port = c->regs[VCPU_REGS_RDX];
io_dir_in = 1;
goto do_io;
case 0xee: /* out al,dx */
case 0xef: /* out (e/r)ax,dx */
port = c->regs[VCPU_REGS_RDX];
io_dir_in = 0;
do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
(c->d & ByteOp) ? 1 : c->op_bytes,
port) != 0) {
c->eip = saved_eip;
goto cannot_emulate;
}
return 0;
case 0xf4: /* hlt */ case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1; ctxt->vcpu->arch.halt_request = 1;
break; break;
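The new 0xE4-0xE7 and 0xEC-0xEF cases above funnel all four port-I/O forms into one do_io tail: the port comes either from an immediate byte or from DX, io_dir_in selects the direction, and the width is one byte for the ByteOp table entries or op_bytes otherwise before the access is handed to kvm_emulate_pio(). A standalone sketch of just that decode, with a stub in place of kvm_emulate_pio() and the byte/word split keyed off opcode parity instead of the decode-table flag:

#include <stdio.h>
#include <stdint.h>

/* Stub in place of kvm_emulate_pio(): the real code hands the access to the PIO path. */
static int emulate_pio_stub(int in, int size, uint16_t port)
{
	printf("%s port 0x%04x, %d byte(s)\n", in ? "in from" : "out to", port, size);
	return 0;
}

/*
 * Port/direction/size decode for 0xE4-0xE7 and 0xEC-0xEF:
 * 0xE4/0xE5 in imm8, 0xE6/0xE7 out imm8, 0xEC/0xED in dx, 0xEE/0xEF out dx;
 * even opcodes are the al (byte) forms, odd opcodes use the full operand size.
 */
static int handle_io_opcode(uint8_t opcode, uint8_t imm8, uint16_t dx, int op_bytes)
{
	int io_dir_in = (opcode == 0xe4 || opcode == 0xe5 ||
			 opcode == 0xec || opcode == 0xed);
	uint16_t port = (opcode <= 0xe7) ? imm8 : dx;
	int size = (opcode & 1) ? op_bytes : 1;

	return emulate_pio_stub(io_dir_in, size, port);
}

int main(void)
{
	handle_io_opcode(0xe4, 0x60, 0, 4);	/* in  al, 0x60 */
	handle_io_opcode(0xef, 0, 0x3f8, 2);	/* out dx, ax   */
	return 0;
}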
...@@ -1754,6 +1804,14 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1754,6 +1804,14 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
ctxt->eflags |= X86_EFLAGS_IF; ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */ c->dst.type = OP_NONE; /* Disable writeback. */
break; break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfd: /* std */
ctxt->eflags |= EFLG_DF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfe ... 0xff: /* Grp4/Grp5 */ case 0xfe ... 0xff: /* Grp4/Grp5 */
rc = emulate_grp45(ctxt, ops); rc = emulate_grp45(ctxt, ops);
if (rc != 0) if (rc != 0)
...@@ -1768,7 +1826,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1768,7 +1826,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* Commit shadow register state. */ /* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
ctxt->vcpu->arch.rip = c->eip; kvm_rip_write(ctxt->vcpu, c->eip);
done: done:
if (rc == X86EMUL_UNHANDLEABLE) { if (rc == X86EMUL_UNHANDLEABLE) {
...@@ -1793,7 +1851,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1793,7 +1851,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
goto done; goto done;
/* Let the processor re-execute the fixed hypercall */ /* Let the processor re-execute the fixed hypercall */
c->eip = ctxt->vcpu->arch.rip; c->eip = kvm_rip_read(ctxt->vcpu);
/* Disable writeback. */ /* Disable writeback. */
c->dst.type = OP_NONE; c->dst.type = OP_NONE;
break; break;
...@@ -1889,7 +1947,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1889,7 +1947,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
if (rc) { if (rc) {
kvm_inject_gp(ctxt->vcpu, 0); kvm_inject_gp(ctxt->vcpu, 0);
c->eip = ctxt->vcpu->arch.rip; c->eip = kvm_rip_read(ctxt->vcpu);
} }
rc = X86EMUL_CONTINUE; rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE; c->dst.type = OP_NONE;
...@@ -1899,7 +1957,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1899,7 +1957,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
if (rc) { if (rc) {
kvm_inject_gp(ctxt->vcpu, 0); kvm_inject_gp(ctxt->vcpu, 0);
c->eip = ctxt->vcpu->arch.rip; c->eip = kvm_rip_read(ctxt->vcpu);
} else { } else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32; c->regs[VCPU_REGS_RDX] = msr_data >> 32;
......
...@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) ...@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
/* Get the TSC speed from Xen */ /* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void) unsigned long xen_tsc_khz(void)
{ {
u64 xen_khz = 1000000ULL << 32; struct pvclock_vcpu_time_info *info =
const struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time; &HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul); return pvclock_tsc_khz(info);
if (info->tsc_shift < 0)
xen_khz <<= -info->tsc_shift;
else
xen_khz >>= info->tsc_shift;
return xen_khz;
} }
cycle_t xen_clocksource_read(void) cycle_t xen_clocksource_read(void)
......
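The xen_tsc_khz() hunk above drops the open-coded conversion in favour of the shared pvclock_tsc_khz() helper declared later in this commit. The arithmetic being factored out is visible in the removed lines: start from 10^6 * 2^32, divide by tsc_to_system_mul, then apply tsc_shift in the opposite direction. A standalone version of that computation (plain 64-bit division instead of do_div; the mul/shift pair is illustrative):

#include <stdio.h>
#include <stdint.h>

/*
 * Same math as the removed xen_tsc_khz() body: pvclock scales TSC deltas to
 * nanoseconds via tsc_to_system_mul and tsc_shift, so inverting those two
 * parameters yields the TSC frequency in kHz.
 */
static uint64_t tsc_khz_from_pvclock(uint32_t tsc_to_system_mul, int8_t tsc_shift)
{
	uint64_t khz = 1000000ULL << 32;

	khz /= tsc_to_system_mul;	/* do_div(xen_khz, info->tsc_to_system_mul) */
	if (tsc_shift < 0)
		khz <<= -tsc_shift;
	else
		khz >>= tsc_shift;
	return khz;
}

int main(void)
{
	/* Illustrative parameters: this pair corresponds to roughly a 2.4 GHz TSC. */
	printf("%llu kHz\n",
	       (unsigned long long)tsc_khz_from_pvclock(0xd5555555, -1));
	return 0;
}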
...@@ -28,9 +28,9 @@ ...@@ -28,9 +28,9 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/dmar.h> #include <linux/dmar.h>
#include <linux/iova.h>
#include <linux/intel-iommu.h>
#include <linux/timer.h> #include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#undef PREFIX #undef PREFIX
#define PREFIX "DMAR:" #define PREFIX "DMAR:"
......
...@@ -33,8 +33,8 @@ ...@@ -33,8 +33,8 @@
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/timer.h> #include <linux/timer.h>
#include "iova.h" #include <linux/iova.h>
#include "intel-iommu.h" #include <linux/intel-iommu.h>
#include <asm/proto.h> /* force_iommu in this header in x86-64*/ #include <asm/proto.h> /* force_iommu in this header in x86-64*/
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
#include <asm/iommu.h> #include <asm/iommu.h>
...@@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void) ...@@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void)
return iommu_kmem_cache_alloc(iommu_domain_cache); return iommu_kmem_cache_alloc(iommu_domain_cache);
} }
static inline void free_domain_mem(void *vaddr) static void free_domain_mem(void *vaddr)
{ {
kmem_cache_free(iommu_domain_cache, vaddr); kmem_cache_free(iommu_domain_cache, vaddr);
} }
...@@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) ...@@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
* find_domain * find_domain
* Note: we use struct pci_dev->dev.archdata.iommu stores the info * Note: we use struct pci_dev->dev.archdata.iommu stores the info
*/ */
struct dmar_domain * static struct dmar_domain *
find_domain(struct pci_dev *pdev) find_domain(struct pci_dev *pdev)
{ {
struct device_domain_info *info; struct device_domain_info *info;
...@@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void) ...@@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void)
return 0; return 0;
} }
void intel_iommu_domain_exit(struct dmar_domain *domain)
{
u64 end;
/* Domain 0 is reserved, so don't process it */
if (!domain)
return;
end = DOMAIN_MAX_ADDR(domain->gaw);
end = end & (~PAGE_MASK_4K);
/* clear ptes */
dma_pte_clear_range(domain, 0, end);
/* free page tables */
dma_pte_free_pagetable(domain, 0, end);
iommu_free_domain(domain);
free_domain_mem(domain);
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
{
struct dmar_drhd_unit *drhd;
struct dmar_domain *domain;
struct intel_iommu *iommu;
drhd = dmar_find_matched_drhd_unit(pdev);
if (!drhd) {
printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
return NULL;
}
iommu = drhd->iommu;
if (!iommu) {
printk(KERN_ERR
"intel_iommu_domain_alloc: iommu == NULL\n");
return NULL;
}
domain = iommu_alloc_domain(iommu);
if (!domain) {
printk(KERN_ERR
"intel_iommu_domain_alloc: domain == NULL\n");
return NULL;
}
if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
printk(KERN_ERR
"intel_iommu_domain_alloc: domain_init() failed\n");
intel_iommu_domain_exit(domain);
return NULL;
}
return domain;
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
int intel_iommu_context_mapping(
struct dmar_domain *domain, struct pci_dev *pdev)
{
int rc;
rc = domain_context_mapping(domain, pdev);
return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
int intel_iommu_page_mapping(
struct dmar_domain *domain, dma_addr_t iova,
u64 hpa, size_t size, int prot)
{
int rc;
rc = domain_page_mapping(domain, iova, hpa, size, prot);
return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
detach_domain_for_dev(domain, bus, devfn);
}
EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
struct dmar_domain *
intel_iommu_find_domain(struct pci_dev *pdev)
{
return find_domain(pdev);
}
EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
int intel_iommu_found(void)
{
return g_num_of_iommus;
}
EXPORT_SYMBOL_GPL(intel_iommu_found);
u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
{
struct dma_pte *pte;
u64 pfn;
pfn = 0;
pte = addr_to_dma_pte(domain, iova);
if (pte)
pfn = dma_pte_addr(*pte);
return pfn >> PAGE_SHIFT_4K;
}
EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/irq.h> #include <linux/irq.h>
#include <asm/io_apic.h> #include <asm/io_apic.h>
#include "intel-iommu.h" #include <linux/intel-iommu.h>
#include "intr_remapping.h" #include "intr_remapping.h"
static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
......
#include "intel-iommu.h" #include <linux/intel-iommu.h>
struct ioapic_scope { struct ioapic_scope {
struct intel_iommu *iommu; struct intel_iommu *iommu;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
*/ */
#include "iova.h" #include <linux/iova.h>
void void
init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
......
...@@ -208,26 +208,4 @@ struct kvm_pit_channel_state { ...@@ -208,26 +208,4 @@ struct kvm_pit_channel_state {
struct kvm_pit_state { struct kvm_pit_state {
struct kvm_pit_channel_state channels[3]; struct kvm_pit_channel_state channels[3];
}; };
#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
#endif /* ASM_X86__KVM_H */ #endif /* ASM_X86__KVM_H */
...@@ -57,6 +57,10 @@ ...@@ -57,6 +57,10 @@
#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
#define DE_VECTOR 0 #define DE_VECTOR 0
#define DB_VECTOR 1
#define BP_VECTOR 3
#define OF_VECTOR 4
#define BR_VECTOR 5
#define UD_VECTOR 6 #define UD_VECTOR 6
#define NM_VECTOR 7 #define NM_VECTOR 7
#define DF_VECTOR 8 #define DF_VECTOR 8
...@@ -65,6 +69,7 @@ ...@@ -65,6 +69,7 @@
#define SS_VECTOR 12 #define SS_VECTOR 12
#define GP_VECTOR 13 #define GP_VECTOR 13
#define PF_VECTOR 14 #define PF_VECTOR 14
#define MF_VECTOR 16
#define MC_VECTOR 18 #define MC_VECTOR 18
#define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_TI_MASK (1 << 2)
...@@ -89,7 +94,7 @@ extern struct list_head vm_list; ...@@ -89,7 +94,7 @@ extern struct list_head vm_list;
struct kvm_vcpu; struct kvm_vcpu;
struct kvm; struct kvm;
enum { enum kvm_reg {
VCPU_REGS_RAX = 0, VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1, VCPU_REGS_RCX = 1,
VCPU_REGS_RDX = 2, VCPU_REGS_RDX = 2,
...@@ -108,6 +113,7 @@ enum { ...@@ -108,6 +113,7 @@ enum {
VCPU_REGS_R14 = 14, VCPU_REGS_R14 = 14,
VCPU_REGS_R15 = 15, VCPU_REGS_R15 = 15,
#endif #endif
VCPU_REGS_RIP,
NR_VCPU_REGS NR_VCPU_REGS
}; };
...@@ -189,10 +195,20 @@ struct kvm_mmu_page { ...@@ -189,10 +195,20 @@ struct kvm_mmu_page {
*/ */
int multimapped; /* More than one parent_pte? */ int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */ int root_count; /* Currently serving as active root */
bool unsync;
bool unsync_children;
union { union {
u64 *parent_pte; /* !multimapped */ u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
}; };
DECLARE_BITMAP(unsync_child_bitmap, 512);
};
struct kvm_pv_mmu_op_buffer {
void *ptr;
unsigned len;
unsigned processed;
char buf[512] __aligned(sizeof(long));
}; };
/* /*
...@@ -207,6 +223,9 @@ struct kvm_mmu { ...@@ -207,6 +223,9 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu, void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page); struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa; hpa_t root_hpa;
int root_level; int root_level;
int shadow_root_level; int shadow_root_level;
...@@ -219,8 +238,13 @@ struct kvm_vcpu_arch { ...@@ -219,8 +238,13 @@ struct kvm_vcpu_arch {
int interrupt_window_open; int interrupt_window_open;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ /*
unsigned long rip; /* needs vcpu_load_rsp_rip() */ * rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
*/
unsigned long regs[NR_VCPU_REGS];
u32 regs_avail;
u32 regs_dirty;
unsigned long cr0; unsigned long cr0;
unsigned long cr2; unsigned long cr2;
...@@ -237,6 +261,9 @@ struct kvm_vcpu_arch { ...@@ -237,6 +261,9 @@ struct kvm_vcpu_arch {
bool tpr_access_reporting; bool tpr_access_reporting;
struct kvm_mmu mmu; struct kvm_mmu mmu;
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_pte_chain_cache;
struct kvm_mmu_memory_cache mmu_rmap_desc_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
...@@ -269,6 +296,11 @@ struct kvm_vcpu_arch { ...@@ -269,6 +296,11 @@ struct kvm_vcpu_arch {
u32 error_code; u32 error_code;
} exception; } exception;
struct kvm_queued_interrupt {
bool pending;
u8 nr;
} interrupt;
struct { struct {
int active; int active;
u8 save_iopl; u8 save_iopl;
...@@ -294,6 +326,7 @@ struct kvm_vcpu_arch { ...@@ -294,6 +326,7 @@ struct kvm_vcpu_arch {
struct page *time_page; struct page *time_page;
bool nmi_pending; bool nmi_pending;
bool nmi_injected;
u64 mtrr[0x100]; u64 mtrr[0x100];
}; };
...@@ -316,9 +349,12 @@ struct kvm_arch{ ...@@ -316,9 +349,12 @@ struct kvm_arch{
* Hash table of struct kvm_mmu_page. * Hash table of struct kvm_mmu_page.
*/ */
struct list_head active_mmu_pages; struct list_head active_mmu_pages;
struct list_head assigned_dev_head;
struct dmar_domain *intel_iommu_domain;
struct kvm_pic *vpic; struct kvm_pic *vpic;
struct kvm_ioapic *vioapic; struct kvm_ioapic *vioapic;
struct kvm_pit *vpit; struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
int round_robin_prev_vcpu; int round_robin_prev_vcpu;
unsigned int tss_addr; unsigned int tss_addr;
...@@ -338,6 +374,7 @@ struct kvm_vm_stat { ...@@ -338,6 +374,7 @@ struct kvm_vm_stat {
u32 mmu_flooded; u32 mmu_flooded;
u32 mmu_recycled; u32 mmu_recycled;
u32 mmu_cache_miss; u32 mmu_cache_miss;
u32 mmu_unsync;
u32 remote_tlb_flush; u32 remote_tlb_flush;
u32 lpages; u32 lpages;
}; };
...@@ -364,6 +401,7 @@ struct kvm_vcpu_stat { ...@@ -364,6 +401,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation; u32 insn_emulation;
u32 insn_emulation_fail; u32 insn_emulation_fail;
u32 hypercalls; u32 hypercalls;
u32 irq_injections;
}; };
struct descriptor_table { struct descriptor_table {
...@@ -414,8 +452,7 @@ struct kvm_x86_ops { ...@@ -414,8 +452,7 @@ struct kvm_x86_ops {
unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
int *exception); int *exception);
void (*cache_regs)(struct kvm_vcpu *vcpu); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
void (*decache_regs)(struct kvm_vcpu *vcpu);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
...@@ -528,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); ...@@ -528,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code); u32 error_code);
void kvm_pic_set_irq(void *opaque, int irq, int level);
void kvm_inject_nmi(struct kvm_vcpu *vcpu); void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void fx_init(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu);
...@@ -550,12 +589,14 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); ...@@ -550,12 +589,14 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu); int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void); void kvm_enable_tdp(void);
void kvm_disable_tdp(void); void kvm_disable_tdp(void);
...@@ -686,33 +727,6 @@ enum { ...@@ -686,33 +727,6 @@ enum {
TASK_SWITCH_GATE = 3, TASK_SWITCH_GATE = 3,
}; };
#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 5, d1, d2, d3, d4, d5)
#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 4, d1, d2, d3, d4, 0)
#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 3, d1, d2, d3, 0, 0)
#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 2, d1, d2, 0, 0, 0)
#define KVMTRACE_1D(evt, vcpu, d1, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 1, d1, 0, 0, 0, 0)
#define KVMTRACE_0D(evt, vcpu, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 0, 0, 0, 0, 0, 0)
#ifdef CONFIG_64BIT
# define KVM_EX_ENTRY ".quad"
# define KVM_EX_PUSH "pushq"
#else
# define KVM_EX_ENTRY ".long"
# define KVM_EX_PUSH "pushl"
#endif
/* /*
* Hardware virtualization extension instructions may fault if a * Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running. * reboot turns off virtualization while processes are running.
...@@ -724,11 +738,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void); ...@@ -724,11 +738,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
"666: " insn "\n\t" \ "666: " insn "\n\t" \
".pushsection .fixup, \"ax\" \n" \ ".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \ "667: \n\t" \
KVM_EX_PUSH " $666b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \
"jmp kvm_handle_fault_on_reboot \n\t" \ "jmp kvm_handle_fault_on_reboot \n\t" \
".popsection \n\t" \ ".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \ ".pushsection __ex_table, \"a\" \n\t" \
KVM_EX_ENTRY " 666b, 667b \n\t" \ _ASM_PTR " 666b, 667b \n\t" \
".popsection" ".popsection"
#define KVM_ARCH_WANT_MMU_NOTIFIER #define KVM_ARCH_WANT_MMU_NOTIFIER
......
...@@ -178,6 +178,9 @@ ...@@ -178,6 +178,9 @@
#define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_FEATURE_CONTROL 0x0000003a
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
#define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_BSP (1<<8)
#define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_ENABLE (1<<11)
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
/* some helper functions for xen and kvm pv clock sources */ /* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall, void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu, struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts); struct timespec *ts);
......
...@@ -25,10 +25,10 @@ ...@@ -25,10 +25,10 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/msi.h> #include <linux/msi.h>
#include <linux/sysdev.h> #include <linux/sysdev.h>
#include "iova.h" #include <linux/iova.h>
#include <linux/io.h> #include <linux/io.h>
#include <linux/dma_remapping.h>
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
#include "dma_remapping.h"
/* /*
* Intel IOMMU register specification per version 1.0 public spec. * Intel IOMMU register specification per version 1.0 public spec.
...@@ -304,4 +304,24 @@ extern int dmar_enable_qi(struct intel_iommu *iommu); ...@@ -304,4 +304,24 @@ extern int dmar_enable_qi(struct intel_iommu *iommu);
extern void qi_global_iec(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu);
extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
void intel_iommu_domain_exit(struct dmar_domain *domain);
struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
int intel_iommu_context_mapping(struct dmar_domain *domain,
struct pci_dev *pdev);
int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
u64 hpa, size_t size, int prot);
void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
#ifdef CONFIG_DMAR
int intel_iommu_found(void);
#else /* CONFIG_DMAR */
static inline int intel_iommu_found(void)
{
return 0;
}
#endif /* CONFIG_DMAR */
#endif #endif
...@@ -311,22 +311,33 @@ struct kvm_s390_interrupt { ...@@ -311,22 +311,33 @@ struct kvm_s390_interrupt {
/* This structure represents a single trace buffer record. */ /* This structure represents a single trace buffer record. */
struct kvm_trace_rec { struct kvm_trace_rec {
__u32 event:28; /* variable rec_val
__u32 extra_u32:3; * is split into:
__u32 cycle_in:1; * bits 0 - 27 -> event id
* bits 28 - 30 -> number of extra data args of size u32
* bit 31 -> indicates whether the tsc is in the record
*/
__u32 rec_val;
__u32 pid; __u32 pid;
__u32 vcpu_id; __u32 vcpu_id;
union { union {
struct { struct {
__u64 cycle_u64; __u64 timestamp;
__u32 extra_u32[KVM_TRC_EXTRA_MAX]; __u32 extra_u32[KVM_TRC_EXTRA_MAX];
} __attribute__((packed)) cycle; } __attribute__((packed)) timestamp;
struct { struct {
__u32 extra_u32[KVM_TRC_EXTRA_MAX]; __u32 extra_u32[KVM_TRC_EXTRA_MAX];
} nocycle; } notimestamp;
} u; } u;
}; };
#define TRACE_REC_EVENT_ID(val) \
(0x0fffffff & (val))
#define TRACE_REC_NUM_DATA_ARGS(val) \
(0x70000000 & ((val) << 28))
#define TRACE_REC_TCS(val) \
(0x80000000 & ((val) << 31))
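The kvm_trace_rec change above folds the old event/extra_u32/cycle_in bitfields into a single explicit rec_val word, laid out as described in the comment and assembled with the new TRACE_REC_* builder macros. A standalone round trip using the macro definitions copied from the hunk (the event id and argument count are arbitrary):

#include <stdio.h>
#include <stdint.h>

/* Builder macros copied verbatim from the hunk above. */
#define TRACE_REC_EVENT_ID(val)      (0x0fffffff & (val))
#define TRACE_REC_NUM_DATA_ARGS(val) (0x70000000 & ((val) << 28))
#define TRACE_REC_TCS(val)           (0x80000000 & ((val) << 31))

int main(void)
{
	/* Arbitrary record: event id 0x215, three extra u32 args, timestamp present. */
	uint32_t rec_val = TRACE_REC_EVENT_ID(0x215u) |
			   TRACE_REC_NUM_DATA_ARGS(3u) |
			   TRACE_REC_TCS(1u);

	printf("rec_val     = %#010x\n", rec_val);
	printf("event id    = %#x\n", rec_val & 0x0fffffff);	/* bits 0-27  */
	printf("extra args  = %u\n", (rec_val >> 28) & 0x7);	/* bits 28-30 */
	printf("tsc present = %u\n", rec_val >> 31);		/* bit 31     */
	return 0;
}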
#define KVMIO 0xAE #define KVMIO 0xAE
/* /*
...@@ -372,6 +383,10 @@ struct kvm_trace_rec { ...@@ -372,6 +383,10 @@ struct kvm_trace_rec {
#define KVM_CAP_MP_STATE 14 #define KVM_CAP_MP_STATE 14
#define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_COALESCED_MMIO 15
#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */
#if defined(CONFIG_X86) || defined(CONFIG_IA64)
#define KVM_CAP_DEVICE_ASSIGNMENT 17
#endif
#define KVM_CAP_IOMMU 18
/* /*
* ioctls for VM fds * ioctls for VM fds
...@@ -401,6 +416,10 @@ struct kvm_trace_rec { ...@@ -401,6 +416,10 @@ struct kvm_trace_rec {
_IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
#define KVM_UNREGISTER_COALESCED_MMIO \ #define KVM_UNREGISTER_COALESCED_MMIO \
_IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
struct kvm_assigned_pci_dev)
#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
/* /*
* ioctls for vcpu fds * ioctls for vcpu fds
...@@ -440,4 +459,45 @@ struct kvm_trace_rec { ...@@ -440,4 +459,45 @@ struct kvm_trace_rec {
#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) #define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
struct kvm_assigned_pci_dev {
__u32 assigned_dev_id;
__u32 busnr;
__u32 devfn;
__u32 flags;
};
struct kvm_assigned_irq {
__u32 assigned_dev_id;
__u32 host_irq;
__u32 guest_irq;
__u32 flags;
};
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
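The structures and ioctl numbers above are the new userspace interface for device assignment. A hedged sketch of how a VMM might drive it on a VM fd: the VM fd is assumed to come from the usual /dev/kvm open plus KVM_CREATE_VM, the assigned_dev_id/bus/devfn/IRQ values are purely illustrative, and placing KVM_DEV_ASSIGN_ENABLE_IOMMU on the device ioctl reflects a reading of the structures rather than anything shown in this excerpt:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>	/* assumed to provide the definitions added above */

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = 1,			/* id chosen by the VMM (illustrative) */
		.busnr = 0x01,				/* illustrative bus */
		.devfn = (0x00 << 3) | 0,		/* slot 0, function 0 (illustrative) */
		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,	/* ask for VT-d mapping */
	};
	struct kvm_assigned_irq irq = {
		.assigned_dev_id = 1,
		.host_irq = 16,				/* illustrative host line */
		.guest_irq = 10,			/* illustrative guest GSI */
		.flags = 0,
	};

	if (ioctl(vm, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
		perror("KVM_ASSIGN_PCI_DEVICE");
	else if (ioctl(vm, KVM_ASSIGN_IRQ, &irq) < 0)
		perror("KVM_ASSIGN_IRQ");

	close(vm);
	close(kvm);
	return 0;
}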
#endif #endif
...@@ -34,6 +34,8 @@ ...@@ -34,6 +34,8 @@
#define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_MMU_RELOAD 3
#define KVM_REQ_TRIPLE_FAULT 4 #define KVM_REQ_TRIPLE_FAULT 4
#define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
struct kvm_vcpu; struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache; extern struct kmem_cache *kvm_vcpu_cache;
...@@ -279,12 +281,68 @@ void kvm_free_physmem(struct kvm *kvm); ...@@ -279,12 +281,68 @@ void kvm_free_physmem(struct kvm *kvm);
struct kvm *kvm_arch_create_vm(void); struct kvm *kvm_arch_create_vm(void);
void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_destroy_vm(struct kvm *kvm);
void kvm_free_all_assigned_devices(struct kvm *kvm);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
int kvm_is_mmio_pfn(pfn_t pfn);
struct kvm_irq_ack_notifier {
struct hlist_node link;
unsigned gsi;
void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
struct work_struct interrupt_work;
struct list_head list;
int assigned_dev_id;
int host_busnr;
int host_devfn;
int host_irq;
int guest_irq;
int irq_requested;
struct pci_dev *dev;
struct kvm *kvm;
};
void kvm_set_irq(struct kvm *kvm, int irq, int level);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
#ifdef CONFIG_DMAR
int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
unsigned long npages);
int kvm_iommu_map_guest(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev);
int kvm_iommu_unmap_guest(struct kvm *kvm);
#else /* CONFIG_DMAR */
static inline int kvm_iommu_map_pages(struct kvm *kvm,
gfn_t base_gfn,
unsigned long npages)
{
return 0;
}
static inline int kvm_iommu_map_guest(struct kvm *kvm,
struct kvm_assigned_dev_kernel
*assigned_dev)
{
return -ENODEV;
}
static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
{
return 0;
}
#endif /* CONFIG_DMAR */
static inline void kvm_guest_enter(void) static inline void kvm_guest_enter(void)
{ {
account_system_vtime(current); account_system_vtime(current);
...@@ -307,6 +365,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) ...@@ -307,6 +365,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
return (gpa_t)gfn << PAGE_SHIFT; return (gpa_t)gfn << PAGE_SHIFT;
} }
static inline hpa_t pfn_to_hpa(pfn_t pfn)
{
return (hpa_t)pfn << PAGE_SHIFT;
}
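pfn_to_hpa() above mirrors the existing gfn_to_gpa(): both are plain shifts by PAGE_SHIFT, so guest-physical and host-physical addresses can be formed symmetrically from frame numbers, which the new IOMMU mapping path can take advantage of. A quick standalone illustration of the arithmetic, assuming 4 KiB pages:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* 4 KiB pages assumed, as on x86 */

typedef uint64_t gfn_t, gpa_t, pfn_t, hpa_t;

static gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; }
static hpa_t pfn_to_hpa(pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; }

int main(void)
{
	gfn_t gfn = 0x12345;	/* guest frame number */
	pfn_t pfn = 0xabcde;	/* host frame backing it (illustrative) */

	printf("gpa = %#llx\n", (unsigned long long)gfn_to_gpa(gfn));	/* 0x12345000 */
	printf("hpa = %#llx\n", (unsigned long long)pfn_to_hpa(pfn));	/* 0xabcde000 */
	return 0;
}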
static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
{ {
set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
...@@ -326,6 +389,25 @@ struct kvm_stats_debugfs_item { ...@@ -326,6 +389,25 @@ struct kvm_stats_debugfs_item {
extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct kvm_stats_debugfs_item debugfs_entries[];
extern struct dentry *kvm_debugfs_dir; extern struct dentry *kvm_debugfs_dir;
#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 5, d1, d2, d3, d4, d5)
#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 4, d1, d2, d3, d4, 0)
#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 3, d1, d2, d3, 0, 0)
#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 2, d1, d2, 0, 0, 0)
#define KVMTRACE_1D(evt, vcpu, d1, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 1, d1, 0, 0, 0, 0)
#define KVMTRACE_0D(evt, vcpu, name) \
trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
vcpu, 0, 0, 0, 0, 0, 0)
#ifdef CONFIG_KVM_TRACE #ifdef CONFIG_KVM_TRACE
int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg); int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
void kvm_trace_cleanup(void); void kvm_trace_cleanup(void);
......
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
#include "ioapic.h" #include "ioapic.h"
#include "lapic.h" #include "lapic.h"
#include "irq.h"
#if 0 #if 0
#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
...@@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) ...@@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
} }
} }
static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
int trigger_mode)
{ {
union ioapic_redir_entry *ent; union ioapic_redir_entry *ent;
ent = &ioapic->redirtbl[gsi]; ent = &ioapic->redirtbl[gsi];
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
kvm_notify_acked_irq(ioapic->kvm, gsi);
if (trigger_mode == IOAPIC_LEVEL_TRIG) {
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0; ent->fields.remote_irr = 0;
if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
ioapic_service(ioapic, gsi); ioapic_service(ioapic, gsi);
}
} }
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
{ {
struct kvm_ioapic *ioapic = kvm->arch.vioapic; struct kvm_ioapic *ioapic = kvm->arch.vioapic;
int i; int i;
for (i = 0; i < IOAPIC_NUM_PINS; i++) for (i = 0; i < IOAPIC_NUM_PINS; i++)
if (ioapic->redirtbl[i].fields.vector == vector) if (ioapic->redirtbl[i].fields.vector == vector)
__kvm_ioapic_update_eoi(ioapic, i); __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
} }
static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
...@@ -380,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, ...@@ -380,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
break; break;
#ifdef CONFIG_IA64 #ifdef CONFIG_IA64
case IOAPIC_REG_EOI: case IOAPIC_REG_EOI:
kvm_ioapic_update_eoi(ioapic->kvm, data); kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
break; break;
#endif #endif
......
...@@ -58,6 +58,7 @@ struct kvm_ioapic { ...@@ -58,6 +58,7 @@ struct kvm_ioapic {
} redirtbl[IOAPIC_NUM_PINS]; } redirtbl[IOAPIC_NUM_PINS];
struct kvm_io_device dev; struct kvm_io_device dev;
struct kvm *kvm; struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
}; };
#ifdef DEBUG #ifdef DEBUG
...@@ -78,16 +79,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) ...@@ -78,16 +79,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
return kvm->arch.vioapic; return kvm->arch.vioapic;
} }
#ifdef CONFIG_IA64
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return 1;
}
#endif
struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
unsigned long bitmap); unsigned long bitmap);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm); int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic); void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
......
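The ia64-only irqchip_in_kernel() stub disappears from ioapic.h; per the "Move irqchip_in_kernel() from ioapic.h to irq.h" commit in this series, each arch now supplies it from its own irq.h. A minimal sketch of what the ia64 replacement header would provide (reconstructed as an assumption, not quoted from this diff):

	/* arch/ia64/kvm/irq.h (sketch) */
	#ifndef __IRQ_H
	#define __IRQ_H

	static inline int irqchip_in_kernel(struct kvm *kvm)
	{
		/* ia64 always emulates its interrupt controllers in the kernel */
		return 1;
	}

	#endif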
/*
* irq_comm.c: Common API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*
*/
#include <linux/kvm_host.h>
#include "irq.h"
#include "ioapic.h"
/* This should be called with the kvm->lock mutex held */
void kvm_set_irq(struct kvm *kvm, int irq, int level)
{
/* Not possible to detect if the guest uses the PIC or the
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
#ifdef CONFIG_X86
kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
#endif
}
void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
{
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
}
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
{
hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
}
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
{
hlist_del(&kian->link);
}
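irq_comm.c gives in-kernel device models a small interrupt API: kvm_set_irq() drives a guest interrupt line into both the PIC and the IOAPIC, and kvm_irq_ack_notifier delivers a callback when the guest acknowledges the corresponding GSI. A condensed sketch of the usage pattern (the device-assignment code added to kvm_main.c below follows the same shape); the my_dev structure and function names are illustrative:

	#include <linux/kvm_host.h>

	struct my_dev {
		struct kvm *kvm;
		int guest_irq;
		struct kvm_irq_ack_notifier ack_notifier;
	};

	static void my_dev_irq_acked(struct kvm_irq_ack_notifier *kian)
	{
		struct my_dev *dev = container_of(kian, struct my_dev, ack_notifier);

		/* guest EOI'd the interrupt; the in-kernel EOI path already holds
		 * kvm->lock here (cf. kvm_assigned_dev_ack_irq below), so just
		 * lower the line again */
		kvm_set_irq(dev->kvm, dev->guest_irq, 0);
	}

	static void my_dev_init_irq(struct my_dev *dev)
	{
		dev->ack_notifier.gsi = dev->guest_irq;
		dev->ack_notifier.irq_acked = my_dev_irq_acked;
		kvm_register_irq_ack_notifier(dev->kvm, &dev->ack_notifier);
	}

	static void my_dev_raise_irq(struct my_dev *dev)
	{
		/* kvm_set_irq() expects kvm->lock to be held, per the comment above */
		mutex_lock(&dev->kvm->lock);
		kvm_set_irq(dev->kvm, dev->guest_irq, 1);
		mutex_unlock(&dev->kvm->lock);
	}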
...@@ -51,6 +51,12 @@ ...@@ -51,6 +51,12 @@
#include "coalesced_mmio.h" #include "coalesced_mmio.h"
#endif #endif
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
#include <linux/pci.h>
#include <linux/interrupt.h>
#include "irq.h"
#endif
MODULE_AUTHOR("Qumranet"); MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -71,11 +77,253 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, ...@@ -71,11 +77,253 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
bool kvm_rebooting; bool kvm_rebooting;
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
int assigned_dev_id)
{
struct list_head *ptr;
struct kvm_assigned_dev_kernel *match;
list_for_each(ptr, head) {
match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
if (match->assigned_dev_id == assigned_dev_id)
return match;
}
return NULL;
}
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
struct kvm_assigned_dev_kernel *assigned_dev;
assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
interrupt_work);
/* This is taken to safely inject irq inside the guest. When
* the interrupt injection (or the ioapic code) uses a
* finer-grained lock, update this
*/
mutex_lock(&assigned_dev->kvm->lock);
kvm_set_irq(assigned_dev->kvm,
assigned_dev->guest_irq, 1);
mutex_unlock(&assigned_dev->kvm->lock);
kvm_put_kvm(assigned_dev->kvm);
}
/* FIXME: Implement the OR logic needed to make shared interrupts on
* this line behave properly
*/
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev =
(struct kvm_assigned_dev_kernel *) dev_id;
kvm_get_kvm(assigned_dev->kvm);
schedule_work(&assigned_dev->interrupt_work);
disable_irq_nosync(irq);
return IRQ_HANDLED;
}
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_assigned_dev_kernel *dev;
if (kian->gsi == -1)
return;
dev = container_of(kian, struct kvm_assigned_dev_kernel,
ack_notifier);
kvm_set_irq(dev->kvm, dev->guest_irq, 0);
enable_irq(dev->host_irq);
}
static void kvm_free_assigned_device(struct kvm *kvm,
struct kvm_assigned_dev_kernel
*assigned_dev)
{
if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
free_irq(assigned_dev->host_irq, (void *)assigned_dev);
kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
if (cancel_work_sync(&assigned_dev->interrupt_work))
/* We had pending work. That means we will have to take
* care of kvm_put_kvm.
*/
kvm_put_kvm(kvm);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
pci_dev_put(assigned_dev->dev);
list_del(&assigned_dev->list);
kfree(assigned_dev);
}
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
struct list_head *ptr, *ptr2;
struct kvm_assigned_dev_kernel *assigned_dev;
list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
assigned_dev = list_entry(ptr,
struct kvm_assigned_dev_kernel,
list);
kvm_free_assigned_device(kvm, assigned_dev);
}
}
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
struct kvm_assigned_irq
*assigned_irq)
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
mutex_lock(&kvm->lock);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_irq->assigned_dev_id);
if (!match) {
mutex_unlock(&kvm->lock);
return -EINVAL;
}
if (match->irq_requested) {
match->guest_irq = assigned_irq->guest_irq;
match->ack_notifier.gsi = assigned_irq->guest_irq;
mutex_unlock(&kvm->lock);
return 0;
}
INIT_WORK(&match->interrupt_work,
kvm_assigned_dev_interrupt_work_handler);
if (irqchip_in_kernel(kvm)) {
if (!capable(CAP_SYS_RAWIO)) {
r = -EPERM;
goto out_release;
}
if (assigned_irq->host_irq)
match->host_irq = assigned_irq->host_irq;
else
match->host_irq = match->dev->irq;
match->guest_irq = assigned_irq->guest_irq;
match->ack_notifier.gsi = assigned_irq->guest_irq;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
/* Even though this is PCI, we don't want to use shared
* interrupts. Sharing host devices with guest-assigned devices
* on the same interrupt line is not a happy situation: there
* are going to be long delays in accepting, acking, etc.
*/
if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
"kvm_assigned_device", (void *)match)) {
r = -EIO;
goto out_release;
}
}
match->irq_requested = true;
mutex_unlock(&kvm->lock);
return r;
out_release:
mutex_unlock(&kvm->lock);
kvm_free_assigned_device(kvm, match);
return r;
}
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
struct pci_dev *dev;
mutex_lock(&kvm->lock);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_dev->assigned_dev_id);
if (match) {
/* device already assigned */
r = -EINVAL;
goto out;
}
match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
if (match == NULL) {
printk(KERN_INFO "%s: Couldn't allocate memory\n",
__func__);
r = -ENOMEM;
goto out;
}
dev = pci_get_bus_and_slot(assigned_dev->busnr,
assigned_dev->devfn);
if (!dev) {
printk(KERN_INFO "%s: host device not found\n", __func__);
r = -EINVAL;
goto out_free;
}
if (pci_enable_device(dev)) {
printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
r = -EBUSY;
goto out_put;
}
r = pci_request_regions(dev, "kvm_assigned_device");
if (r) {
printk(KERN_INFO "%s: Could not get access to device regions\n",
__func__);
goto out_disable;
}
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_busnr = assigned_dev->busnr;
match->host_devfn = assigned_dev->devfn;
match->dev = dev;
match->kvm = kvm;
list_add(&match->list, &kvm->arch.assigned_dev_head);
if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
r = kvm_iommu_map_guest(kvm, match);
if (r)
goto out_list_del;
}
out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
list_del(&match->list);
pci_release_regions(dev);
out_disable:
pci_disable_device(dev);
out_put:
pci_dev_put(dev);
out_free:
kfree(match);
mutex_unlock(&kvm->lock);
return r;
}
#endif
static inline int valid_vcpu(int n) static inline int valid_vcpu(int n)
{ {
return likely(n >= 0 && n < KVM_MAX_VCPUS); return likely(n >= 0 && n < KVM_MAX_VCPUS);
} }
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn))
return PageReserved(pfn_to_page(pfn));
return true;
}
/* /*
* Switches to specified vcpu, until a matching vcpu_put() * Switches to specified vcpu, until a matching vcpu_put()
*/ */
...@@ -570,6 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm, ...@@ -570,6 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
} }
kvm_free_physmem_slot(&old, &new); kvm_free_physmem_slot(&old, &new);
#ifdef CONFIG_DMAR
/* map the pages in iommu page table */
r = kvm_iommu_map_pages(kvm, base_gfn, npages);
if (r)
goto out;
#endif
return 0; return 0;
out_free: out_free:
...@@ -708,9 +962,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) ...@@ -708,9 +962,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
} }
EXPORT_SYMBOL_GPL(gfn_to_hva); EXPORT_SYMBOL_GPL(gfn_to_hva);
/*
* Requires current->mm->mmap_sem to be held
*/
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{ {
struct page *page[1]; struct page *page[1];
...@@ -726,21 +977,24 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) ...@@ -726,21 +977,24 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return page_to_pfn(bad_page); return page_to_pfn(bad_page);
} }
npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, npages = get_user_pages_fast(addr, 1, 1, page);
NULL);
if (unlikely(npages != 1)) { if (unlikely(npages != 1)) {
struct vm_area_struct *vma; struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr); vma = find_vma(current->mm, addr);
if (vma == NULL || addr < vma->vm_start || if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) { !(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
get_page(bad_page); get_page(bad_page);
return page_to_pfn(bad_page); return page_to_pfn(bad_page);
} }
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
BUG_ON(pfn_valid(pfn)); up_read(&current->mm->mmap_sem);
BUG_ON(!kvm_is_mmio_pfn(pfn));
} else } else
pfn = page_to_pfn(page[0]); pfn = page_to_pfn(page[0]);
...@@ -754,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) ...@@ -754,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
pfn_t pfn; pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn); pfn = gfn_to_pfn(kvm, gfn);
if (pfn_valid(pfn)) if (!kvm_is_mmio_pfn(pfn))
return pfn_to_page(pfn); return pfn_to_page(pfn);
WARN_ON(!pfn_valid(pfn)); WARN_ON(kvm_is_mmio_pfn(pfn));
get_page(bad_page); get_page(bad_page);
return bad_page; return bad_page;
...@@ -773,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); ...@@ -773,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn) void kvm_release_pfn_clean(pfn_t pfn)
{ {
if (pfn_valid(pfn)) if (!kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn)); put_page(pfn_to_page(pfn));
} }
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
...@@ -799,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty); ...@@ -799,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
void kvm_set_pfn_dirty(pfn_t pfn) void kvm_set_pfn_dirty(pfn_t pfn)
{ {
if (pfn_valid(pfn)) { if (!kvm_is_mmio_pfn(pfn)) {
struct page *page = pfn_to_page(pfn); struct page *page = pfn_to_page(pfn);
if (!PageReserved(page)) if (!PageReserved(page))
SetPageDirty(page); SetPageDirty(page);
...@@ -809,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); ...@@ -809,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
void kvm_set_pfn_accessed(pfn_t pfn) void kvm_set_pfn_accessed(pfn_t pfn)
{ {
if (pfn_valid(pfn)) if (!kvm_is_mmio_pfn(pfn))
mark_page_accessed(pfn_to_page(pfn)); mark_page_accessed(pfn_to_page(pfn));
} }
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
void kvm_get_pfn(pfn_t pfn) void kvm_get_pfn(pfn_t pfn)
{ {
if (pfn_valid(pfn)) if (!kvm_is_mmio_pfn(pfn))
get_page(pfn_to_page(pfn)); get_page(pfn_to_page(pfn));
} }
EXPORT_SYMBOL_GPL(kvm_get_pfn); EXPORT_SYMBOL_GPL(kvm_get_pfn);
...@@ -972,12 +1226,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ...@@ -972,12 +1226,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
for (;;) { for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_cpu_has_interrupt(vcpu)) if (kvm_cpu_has_interrupt(vcpu) ||
break; kvm_cpu_has_pending_timer(vcpu) ||
if (kvm_cpu_has_pending_timer(vcpu)) kvm_arch_vcpu_runnable(vcpu)) {
break; set_bit(KVM_REQ_UNHALT, &vcpu->requests);
if (kvm_arch_vcpu_runnable(vcpu))
break; break;
}
if (signal_pending(current)) if (signal_pending(current))
break; break;
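kvm_vcpu_block() now records why it stopped waiting: KVM_REQ_UNHALT is set when an interrupt, a pending timer, or an arch-specific runnable condition ends the halt, but not when a signal does. A hedged sketch of how the arch run loop can consume the bit afterwards (the mp_state handling is modeled on x86 and is illustrative):

	kvm_vcpu_block(vcpu);

	if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) {
		/* the guest really is runnable again: leave the halted state */
		if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	}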
...@@ -1074,12 +1328,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) ...@@ -1074,12 +1328,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
r = kvm_arch_vcpu_setup(vcpu); r = kvm_arch_vcpu_setup(vcpu);
if (r) if (r)
goto vcpu_destroy; return r;
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
if (kvm->vcpus[n]) { if (kvm->vcpus[n]) {
r = -EEXIST; r = -EEXIST;
mutex_unlock(&kvm->lock);
goto vcpu_destroy; goto vcpu_destroy;
} }
kvm->vcpus[n] = vcpu; kvm->vcpus[n] = vcpu;
...@@ -1095,8 +1348,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) ...@@ -1095,8 +1348,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
unlink: unlink:
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
kvm->vcpus[n] = NULL; kvm->vcpus[n] = NULL;
mutex_unlock(&kvm->lock);
vcpu_destroy: vcpu_destroy:
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_destroy(vcpu); kvm_arch_vcpu_destroy(vcpu);
return r; return r;
} }
...@@ -1118,6 +1371,8 @@ static long kvm_vcpu_ioctl(struct file *filp, ...@@ -1118,6 +1371,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
struct kvm_vcpu *vcpu = filp->private_data; struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
int r; int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
if (vcpu->kvm->mm != current->mm) if (vcpu->kvm->mm != current->mm)
return -EIO; return -EIO;
...@@ -1165,25 +1420,28 @@ static long kvm_vcpu_ioctl(struct file *filp, ...@@ -1165,25 +1420,28 @@ static long kvm_vcpu_ioctl(struct file *filp,
break; break;
} }
case KVM_GET_SREGS: { case KVM_GET_SREGS: {
struct kvm_sregs kvm_sregs; kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
r = -ENOMEM;
memset(&kvm_sregs, 0, sizeof kvm_sregs); if (!kvm_sregs)
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); goto out;
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
if (r) if (r)
goto out; goto out;
r = -EFAULT; r = -EFAULT;
if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
goto out; goto out;
r = 0; r = 0;
break; break;
} }
case KVM_SET_SREGS: { case KVM_SET_SREGS: {
struct kvm_sregs kvm_sregs; kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
r = -ENOMEM;
if (!kvm_sregs)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
goto out; goto out;
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
if (r) if (r)
goto out; goto out;
r = 0; r = 0;
...@@ -1264,25 +1522,28 @@ static long kvm_vcpu_ioctl(struct file *filp, ...@@ -1264,25 +1522,28 @@ static long kvm_vcpu_ioctl(struct file *filp,
break; break;
} }
case KVM_GET_FPU: { case KVM_GET_FPU: {
struct kvm_fpu fpu; fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
r = -ENOMEM;
memset(&fpu, 0, sizeof fpu); if (!fpu)
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); goto out;
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
if (r) if (r)
goto out; goto out;
r = -EFAULT; r = -EFAULT;
if (copy_to_user(argp, &fpu, sizeof fpu)) if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
goto out; goto out;
r = 0; r = 0;
break; break;
} }
case KVM_SET_FPU: { case KVM_SET_FPU: {
struct kvm_fpu fpu; fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
r = -ENOMEM;
if (!fpu)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&fpu, argp, sizeof fpu)) if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
goto out; goto out;
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
if (r) if (r)
goto out; goto out;
r = 0; r = 0;
...@@ -1292,6 +1553,8 @@ static long kvm_vcpu_ioctl(struct file *filp, ...@@ -1292,6 +1553,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
} }
out: out:
kfree(fpu);
kfree(kvm_sregs);
return r; return r;
} }
...@@ -1359,6 +1622,30 @@ static long kvm_vm_ioctl(struct file *filp, ...@@ -1359,6 +1622,30 @@ static long kvm_vm_ioctl(struct file *filp,
r = 0; r = 0;
break; break;
} }
#endif
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
case KVM_ASSIGN_PCI_DEVICE: {
struct kvm_assigned_pci_dev assigned_dev;
r = -EFAULT;
if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
goto out;
r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
if (r)
goto out;
break;
}
case KVM_ASSIGN_IRQ: {
struct kvm_assigned_irq assigned_irq;
r = -EFAULT;
if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
goto out;
r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
if (r)
goto out;
break;
}
#endif #endif
default: default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg); r = kvm_arch_vm_ioctl(filp, ioctl, arg);
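KVM_ASSIGN_PCI_DEVICE and KVM_ASSIGN_IRQ are the userspace-visible half of device assignment. A minimal sketch of how a VMM might issue them against the VM file descriptor; only the fields used by the handlers above are filled in, the id scheme is a userspace choice, and the full uapi struct layout comes from <linux/kvm.h>, not from this diff:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int assign_host_device(int vm_fd, int bus, int devfn,
	                              int host_irq, int guest_irq)
	{
		struct kvm_assigned_pci_dev dev = {
			.assigned_dev_id = (bus << 8) | devfn,  /* any id unique per device */
			.busnr           = bus,
			.devfn           = devfn,
			.flags           = KVM_DEV_ASSIGN_ENABLE_IOMMU, /* also build VT-d mappings */
		};
		struct kvm_assigned_irq irq = {
			.assigned_dev_id = dev.assigned_dev_id,
			.host_irq        = host_irq,  /* 0 lets the kernel fall back to dev->irq */
			.guest_irq       = guest_irq,
		};

		if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
			return -1;
		return ioctl(vm_fd, KVM_ASSIGN_IRQ, &irq);
	}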
...@@ -1369,17 +1656,22 @@ static long kvm_vm_ioctl(struct file *filp, ...@@ -1369,17 +1656,22 @@ static long kvm_vm_ioctl(struct file *filp,
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
struct page *page[1];
unsigned long addr;
int npages;
gfn_t gfn = vmf->pgoff;
struct kvm *kvm = vma->vm_file->private_data; struct kvm *kvm = vma->vm_file->private_data;
struct page *page;
if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
page = gfn_to_page(kvm, vmf->pgoff);
if (is_error_page(page)) { npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
kvm_release_page_clean(page); NULL);
if (unlikely(npages != 1))
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
}
vmf->page = page; vmf->page = page[0];
return 0; return 0;
} }
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/relay.h> #include <linux/relay.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/ktime.h>
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
...@@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace; ...@@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace;
struct kvm_trace_probe { struct kvm_trace_probe {
const char *name; const char *name;
const char *format; const char *format;
u32 cycle_in; u32 timestamp_in;
marker_probe_func *probe_func; marker_probe_func *probe_func;
}; };
static inline int calc_rec_size(int cycle, int extra) static inline int calc_rec_size(int timestamp, int extra)
{ {
int rec_size = KVM_TRC_HEAD_SIZE; int rec_size = KVM_TRC_HEAD_SIZE;
rec_size += extra; rec_size += extra;
return cycle ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
} }
static void kvm_add_trace(void *probe_private, void *call_data, static void kvm_add_trace(void *probe_private, void *call_data,
...@@ -54,12 +55,13 @@ static void kvm_add_trace(void *probe_private, void *call_data, ...@@ -54,12 +55,13 @@ static void kvm_add_trace(void *probe_private, void *call_data,
struct kvm_trace *kt = kvm_trace; struct kvm_trace *kt = kvm_trace;
struct kvm_trace_rec rec; struct kvm_trace_rec rec;
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
int i, extra, size; int i, size;
u32 extra;
if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
return; return;
rec.event = va_arg(*args, u32); rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32));
vcpu = va_arg(*args, struct kvm_vcpu *); vcpu = va_arg(*args, struct kvm_vcpu *);
rec.pid = current->tgid; rec.pid = current->tgid;
rec.vcpu_id = vcpu->vcpu_id; rec.vcpu_id = vcpu->vcpu_id;
...@@ -67,21 +69,21 @@ static void kvm_add_trace(void *probe_private, void *call_data, ...@@ -67,21 +69,21 @@ static void kvm_add_trace(void *probe_private, void *call_data,
extra = va_arg(*args, u32); extra = va_arg(*args, u32);
WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX);
rec.extra_u32 = extra;
rec.cycle_in = p->cycle_in; rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
| TRACE_REC_NUM_DATA_ARGS(extra);
if (rec.cycle_in) { if (p->timestamp_in) {
rec.u.cycle.cycle_u64 = get_cycles(); rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
for (i = 0; i < rec.extra_u32; i++) for (i = 0; i < extra; i++)
rec.u.cycle.extra_u32[i] = va_arg(*args, u32); rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
} else { } else {
for (i = 0; i < rec.extra_u32; i++) for (i = 0; i < extra; i++)
rec.u.nocycle.extra_u32[i] = va_arg(*args, u32); rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
} }
size = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32)); size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
relay_write(kt->rchan, &rec, size); relay_write(kt->rchan, &rec, size);
} }
......
/*
* Copyright (c) 2006, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Copyright (C) 2006-2008 Intel Corporation
* Copyright IBM Corporation, 2008
* Author: Allen M. Kay <allen.m.kay@intel.com>
* Author: Weidong Han <weidong.han@intel.com>
* Author: Ben-Ami Yassour <benami@il.ibm.com>
*/
#include <linux/list.h>
#include <linux/kvm_host.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/intel-iommu.h>
static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
int kvm_iommu_map_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages)
{
gfn_t gfn = base_gfn;
pfn_t pfn;
int i, r = 0;
struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
/* check if iommu exists and in use */
if (!domain)
return 0;
for (i = 0; i < npages; i++) {
/* check if already mapped */
pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
gfn_to_gpa(gfn));
if (pfn)
continue;
pfn = gfn_to_pfn(kvm, gfn);
r = intel_iommu_page_mapping(domain,
gfn_to_gpa(gfn),
pfn_to_hpa(pfn),
PAGE_SIZE,
DMA_PTE_READ |
DMA_PTE_WRITE);
if (r) {
printk(KERN_ERR "kvm_iommu_map_pages:"
"iommu failed to map pfn=%lx\n", pfn);
goto unmap_pages;
}
gfn++;
}
return 0;
unmap_pages:
kvm_iommu_put_pages(kvm, base_gfn, i);
return r;
}
static int kvm_iommu_map_memslots(struct kvm *kvm)
{
int i, r;
down_read(&kvm->slots_lock);
for (i = 0; i < kvm->nmemslots; i++) {
r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
kvm->memslots[i].npages);
if (r)
break;
}
up_read(&kvm->slots_lock);
return r;
}
int kvm_iommu_map_guest(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev)
{
struct pci_dev *pdev = NULL;
int r;
if (!intel_iommu_found()) {
printk(KERN_ERR "%s: intel iommu not found\n", __func__);
return -ENODEV;
}
printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
assigned_dev->host_busnr,
PCI_SLOT(assigned_dev->host_devfn),
PCI_FUNC(assigned_dev->host_devfn));
pdev = assigned_dev->dev;
if (pdev == NULL) {
if (kvm->arch.intel_iommu_domain) {
intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
kvm->arch.intel_iommu_domain = NULL;
}
return -ENODEV;
}
kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
if (!kvm->arch.intel_iommu_domain)
return -ENODEV;
r = kvm_iommu_map_memslots(kvm);
if (r)
goto out_unmap;
intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
pdev->bus->number, pdev->devfn);
r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
pdev);
if (r) {
printk(KERN_ERR "Domain context map for %s failed",
pci_name(pdev));
goto out_unmap;
}
return 0;
out_unmap:
kvm_iommu_unmap_memslots(kvm);
return r;
}
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages)
{
gfn_t gfn = base_gfn;
pfn_t pfn;
struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
int i;
for (i = 0; i < npages; i++) {
pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
gfn_to_gpa(gfn));
kvm_release_pfn_clean(pfn);
gfn++;
}
}
static int kvm_iommu_unmap_memslots(struct kvm *kvm)
{
int i;
down_read(&kvm->slots_lock);
for (i = 0; i < kvm->nmemslots; i++) {
kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
kvm->memslots[i].npages);
}
up_read(&kvm->slots_lock);
return 0;
}
int kvm_iommu_unmap_guest(struct kvm *kvm)
{
struct kvm_assigned_dev_kernel *entry;
struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
/* check if iommu exists and in use */
if (!domain)
return 0;
list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
entry->host_busnr,
PCI_SLOT(entry->host_devfn),
PCI_FUNC(entry->host_devfn));
/* detach kvm dmar domain */
intel_iommu_detach_dev(domain, entry->host_busnr,
entry->host_devfn);
}
kvm_iommu_unmap_memslots(kvm);
intel_iommu_domain_exit(domain);
return 0;
}
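vtd.c ties the VT-d domain to the lifetime of the VM: kvm_iommu_map_guest() is invoked from the assignment ioctl when KVM_DEV_ASSIGN_ENABLE_IOMMU is set, kvm_iommu_map_pages() is additionally invoked for every memslot created later (see the __kvm_set_memory_region hunk above, under CONFIG_DMAR), and kvm_iommu_unmap_guest() undoes everything at teardown. A condensed sketch of the call order; the teardown call site is named only as an assumption:

	/* device assignment (kvm_vm_ioctl_assign_device above) */
	r = kvm_iommu_map_guest(kvm, assigned_dev);  /* allocate domain, map existing memslots */

	/* later memslot creation (__kvm_set_memory_region above) */
	r = kvm_iommu_map_pages(kvm, base_gfn, npages);

	/* VM teardown, e.g. from the arch destroy path (assumed call site) */
	kvm_iommu_unmap_guest(kvm);                  /* detach devices, drop page refs, exit domain */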