提交 49c13b51 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (80 commits)
  KVM: Use CPU_DYING for disabling virtualization
  KVM: Tune hotplug/suspend IPIs
  KVM: Keep track of which cpus have virtualization enabled
  SMP: Allow smp_call_function_single() to current cpu
  i386: Allow smp_call_function_single() to current cpu
  x86_64: Allow smp_call_function_single() to current cpu
  HOTPLUG: Adapt thermal throttle to CPU_DYING
  HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING
  HOTPLUG: Add CPU_DYING notifier
  KVM: Clean up #includes
  KVM: Remove kvmfs in favor of the anonymous inodes source
  KVM: SVM: Reliably detect if SVM was disabled by BIOS
  KVM: VMX: Remove unnecessary code in vmx_tlb_flush()
  KVM: MMU: Fix Wrong tlb flush order
  KVM: VMX: Reinitialize the real-mode tss when entering real mode
  KVM: Avoid useless memory write when possible
  KVM: Fix x86 emulator writeback
  KVM: Add support for in-kernel pio handlers
  KVM: VMX: Fix interrupt checking on lightweight exit
  KVM: Adds support for in-kernel mmio handlers
  ...
......@@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
int err;
sys_dev = get_cpu_sysdev(cpu);
mutex_lock(&therm_cpu_lock);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
mutex_lock(&therm_cpu_lock);
err = thermal_throttle_add_dev(sys_dev);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
mutex_lock(&therm_cpu_lock);
thermal_throttle_remove_dev(sys_dev);
mutex_unlock(&therm_cpu_lock);
break;
}
mutex_unlock(&therm_cpu_lock);
return NOTIFY_OK;
}
......
......@@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
EXPORT_SYMBOL(smp_call_function);
/**
* smp_call_function_single - Run a function on another CPU
* smp_call_function_single - Run a function on a specific CPU
* @cpu: The target CPU. Cannot be the calling CPU.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
......@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int ret;
int me = get_cpu();
if (cpu == me) {
WARN_ON(1);
local_irq_disable();
func(info);
local_irq_enable();
put_cpu();
return -EBUSY;
return 0;
}
ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
......
......@@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
/*
* smp_call_function_single - Run a function on another CPU
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: Currently unused.
......@@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
{
/* prevent preemption and reschedule on another processor */
int me = get_cpu();
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
if (cpu == me) {
local_irq_disable();
func(info);
local_irq_enable();
put_cpu();
return 0;
}
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
spin_lock_bh(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait);
spin_unlock_bh(&call_lock);
......
#
# KVM configuration
#
menu "Virtualization"
menuconfig VIRTUALIZATION
bool "Virtualization"
depends on X86
default y
if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on X86 && EXPERIMENTAL
depends on X86_CMPXCHG64 || 64BIT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
......@@ -35,4 +40,4 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
endmenu
endif # VIRTUALIZATION
......@@ -10,6 +10,8 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <asm/signal.h>
......@@ -18,6 +20,7 @@
#include <linux/kvm_para.h>
#define CR0_PE_MASK (1ULL << 0)
#define CR0_MP_MASK (1ULL << 1)
#define CR0_TS_MASK (1ULL << 3)
#define CR0_NE_MASK (1ULL << 5)
#define CR0_WP_MASK (1ULL << 16)
......@@ -42,7 +45,8 @@
(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
| CR0_NW_MASK | CR0_CD_MASK)
#define KVM_VM_CR0_ALWAYS_ON \
(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK)
(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \
| CR0_MP_MASK)
#define KVM_GUEST_CR4_MASK \
(CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
......@@ -51,10 +55,10 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
#define KVM_MAX_VCPUS 1
#define KVM_MAX_VCPUS 4
#define KVM_ALIAS_SLOTS 4
#define KVM_MEMORY_SLOTS 4
#define KVM_NUM_MMU_PAGES 256
#define KVM_NUM_MMU_PAGES 1024
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
......@@ -79,6 +83,11 @@
#define KVM_PIO_PAGE_OFFSET 1
/*
* vcpu->requests bit members
*/
#define KVM_TLB_FLUSH 0
/*
* Address types:
*
......@@ -137,7 +146,7 @@ struct kvm_mmu_page {
gfn_t gfn;
union kvm_mmu_page_role role;
hpa_t page_hpa;
u64 *spt;
unsigned long slot_bitmap; /* One bit set per slot which has memory
* in this shadow page.
*/
......@@ -232,6 +241,7 @@ struct kvm_pio_request {
struct page *guest_pages[2];
unsigned guest_page_offset;
int in;
int port;
int size;
int string;
int down;
......@@ -252,8 +262,70 @@ struct kvm_stat {
u32 halt_exits;
u32 request_irq_exits;
u32 irq_exits;
u32 light_exits;
u32 efer_reload;
};
struct kvm_io_device {
void (*read)(struct kvm_io_device *this,
gpa_t addr,
int len,
void *val);
void (*write)(struct kvm_io_device *this,
gpa_t addr,
int len,
const void *val);
int (*in_range)(struct kvm_io_device *this, gpa_t addr);
void (*destructor)(struct kvm_io_device *this);
void *private;
};
static inline void kvm_iodevice_read(struct kvm_io_device *dev,
gpa_t addr,
int len,
void *val)
{
dev->read(dev, addr, len, val);
}
static inline void kvm_iodevice_write(struct kvm_io_device *dev,
gpa_t addr,
int len,
const void *val)
{
dev->write(dev, addr, len, val);
}
static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
{
return dev->in_range(dev, addr);
}
static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
if (dev->destructor)
dev->destructor(dev);
}
/*
* It would be nice to use something smarter than a linear search, TBD...
* Thankfully we dont expect many devices to register (famous last words :),
* so until then it will suffice. At least its abstracted so we can change
* in one place.
*/
struct kvm_io_bus {
int dev_count;
#define NR_IOBUS_DEVS 6
struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
void kvm_io_bus_init(struct kvm_io_bus *bus);
void kvm_io_bus_destroy(struct kvm_io_bus *bus);
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
struct kvm_io_device *dev);
struct kvm_vcpu {
struct kvm *kvm;
union {
......@@ -266,6 +338,8 @@ struct kvm_vcpu {
u64 host_tsc;
struct kvm_run *run;
int interrupt_window_open;
int guest_mode;
unsigned long requests;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
unsigned long irq_pending[NR_IRQ_WORDS];
......@@ -285,15 +359,20 @@ struct kvm_vcpu {
u64 apic_base;
u64 ia32_misc_enable_msr;
int nmsrs;
int save_nmsrs;
int msr_offset_efer;
#ifdef CONFIG_X86_64
int msr_offset_kernel_gs_base;
#endif
struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs;
struct list_head free_pages;
struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
struct kvm_mmu mmu;
struct kvm_mmu_memory_cache mmu_pte_chain_cache;
struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
gfn_t last_pt_write_gfn;
int last_pt_write_count;
......@@ -305,6 +384,11 @@ struct kvm_vcpu {
char *guest_fx_image;
int fpu_active;
int guest_fpu_loaded;
struct vmx_host_state {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
int fs_gs_ldt_reload_needed;
} vmx_host_state;
int mmio_needed;
int mmio_read_completed;
......@@ -331,6 +415,7 @@ struct kvm_vcpu {
u32 ar;
} tr, es, ds, fs, gs;
} rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
......@@ -362,12 +447,15 @@ struct kvm {
struct list_head active_mmu_pages;
int n_free_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
int nvcpus;
struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
int memory_config_version;
int busy;
unsigned long rmap_overflow;
struct list_head vm_list;
struct file *filp;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
};
struct descriptor_table {
......@@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int string, int down,
gva_t address, int rep, unsigned port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
......@@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_flush_remote_tlbs(struct kvm *kvm);
int kvm_read_guest(struct kvm_vcpu *vcpu,
gva_t addr,
......@@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
unsigned long segment_base(u16 selector);
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *old, const u8 *new, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
......@@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
return vcpu->mmu.page_fault(vcpu, gva, error_code);
}
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
}
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
......
此差异已折叠。
......@@ -16,15 +16,18 @@
* the COPYING file in the top-level directory.
*
*/
#include "vmx.h"
#include "kvm.h"
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include "vmx.h"
#include "kvm.h"
#include <asm/page.h>
#include <asm/cmpxchg.h>
#undef MMU_DEBUG
......@@ -90,25 +93,11 @@ static int dbg = 1;
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT32_PTE_COPY_MASK \
(PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9
......@@ -165,6 +154,8 @@ struct kvm_rmap_desc {
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_cache;
static struct kmem_cache *mmu_page_header_cache;
static int is_write_protection(struct kvm_vcpu *vcpu)
{
......@@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte)
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
set_64bit((unsigned long *)sptep, spte);
#else
set_64bit((unsigned long long *)sptep, spte);
#endif
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min,
gfp_t gfp_flags)
......@@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
rmap_desc_cache, 1, gfp_flags);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
mmu_page_cache, 4, gfp_flags);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
mmu_page_header_cache, 4, gfp_flags);
out:
return r;
}
......@@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
mmu_free_memory_cache(&vcpu->mmu_page_cache);
mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
......@@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
rmap_remove(vcpu, spte);
kvm_arch_ops->tlb_flush(vcpu);
*spte &= ~(u64)PT_WRITABLE_MASK;
set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
#ifdef MMU_DEBUG
static int is_empty_shadow_page(hpa_t page_hpa)
static int is_empty_shadow_page(u64 *spt)
{
u64 *pos;
u64 *end;
for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
pos != end; pos++)
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
if (*pos != 0) {
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
pos, *pos);
......@@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa)
}
#endif
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page_head)
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
ASSERT(is_empty_shadow_page(page_hpa));
page_head->page_hpa = page_hpa;
list_move(&page_head->link, &vcpu->free_pages);
ASSERT(is_empty_shadow_page(page_head->spt));
list_del(&page_head->link);
mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
++vcpu->kvm->n_free_mmu_pages;
}
......@@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
{
struct kvm_mmu_page *page;
if (list_empty(&vcpu->free_pages))
if (!vcpu->kvm->n_free_mmu_pages)
return NULL;
page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
list_move(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->page_hpa));
page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
sizeof *page);
page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(page->spt), (unsigned long)page);
list_add(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->spt));
page->slot_bitmap = 0;
page->multimapped = 0;
page->parent_pte = parent_pte;
......@@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
u64 *pt;
u64 ent;
pt = __va(page->page_hpa);
pt = page->spt;
if (page->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
......@@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
rmap_remove(vcpu, &pt[i]);
pt[i] = 0;
}
kvm_arch_ops->tlb_flush(vcpu);
kvm_flush_remote_tlbs(vcpu->kvm);
return;
}
......@@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
}
kvm_flush_remote_tlbs(vcpu->kvm);
}
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
......@@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(vcpu, page, parent_pte);
*parent_pte = 0;
set_shadow_pte(parent_pte, 0);
}
kvm_mmu_page_unlink_children(vcpu, page);
if (!page->root_count) {
hlist_del(&page->hash_link);
kvm_mmu_free_page(vcpu, page->page_hpa);
kvm_mmu_free_page(vcpu, page);
} else
list_move(&page->link, &vcpu->kvm->active_mmu_pages);
}
......@@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
return r;
}
static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
{
struct kvm_mmu_page *page;
while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n",
__FUNCTION__, gfn, page->role.word);
kvm_mmu_zap_page(vcpu, page);
}
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
......@@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
return -ENOMEM;
}
table[index] = new_table->page_hpa | PT_PRESENT_MASK
table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
}
table_addr = table[index] & PT64_BASE_ADDR_MASK;
......@@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *page;
if (!VALID_PAGE(vcpu->mmu.root_hpa))
return;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
page = page_header(root);
--page->root_count;
vcpu->mmu.root_hpa = INVALID_PAGE;
......@@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->mmu.pae_root[i];
if (root) {
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK;
page = page_header(root);
--page->root_count;
......@@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
page = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, 0, NULL);
root = page->page_hpa;
root = __pa(page->spt);
++page->root_count;
vcpu->mmu.root_hpa = root;
return;
......@@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
0, NULL);
root = page->page_hpa;
root = __pa(page->spt);
++page->root_count;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
......@@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->free = nonpaging_free;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
context->root_hpa = INVALID_PAGE;
return 0;
}
......@@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
mmu_free_roots(vcpu);
if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
}
static inline void set_pte_common(struct kvm_vcpu *vcpu,
u64 *shadow_pte,
gpa_t gaddr,
int dirty,
u64 access_bits,
gfn_t gfn)
{
hpa_t paddr;
*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
*shadow_pte |= access_bits;
if (is_error_hpa(paddr)) {
*shadow_pte |= gaddr;
*shadow_pte |= PT_SHADOW_IO_MARK;
*shadow_pte &= ~PT_PRESENT_MASK;
return;
}
*shadow_pte |= paddr;
if (access_bits & PT_WRITABLE_MASK) {
struct kvm_mmu_page *shadow;
shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
if (is_writeble_pte(*shadow_pte)) {
*shadow_pte &= ~PT_WRITABLE_MASK;
kvm_arch_ops->tlb_flush(vcpu);
}
}
}
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
rmap_add(vcpu, shadow_pte);
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
......@@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}
static inline int fix_read_pf(u64 *shadow_ent)
{
if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
!(*shadow_ent & PT_USER_MASK)) {
/*
* If supervisor write protect is disabled, we shadow kernel
* pages as user pages so we can trap the write access.
*/
*shadow_ent |= PT_USER_MASK;
*shadow_ent &= ~PT_WRITABLE_MASK;
return 1;
}
return 0;
}
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
......@@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
context->root_hpa = INVALID_PAGE;
return 0;
}
......@@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
context->root_hpa = INVALID_PAGE;
return 0;
}
......@@ -1106,19 +1052,34 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
destroy_kvm_mmu(vcpu);
return init_kvm_mmu(vcpu);
}
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
destroy_kvm_mmu(vcpu);
r = init_kvm_mmu(vcpu);
if (r < 0)
goto out;
spin_lock(&vcpu->kvm->lock);
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
mmu_alloc_roots(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
spin_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *spte)
{
......@@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
}
}
*spte = 0;
kvm_flush_remote_tlbs(vcpu->kvm);
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *spte,
const void *new, int bytes)
{
if (page->role.level != PT_PAGE_TABLE_LEVEL)
return;
if (page->role.glevels == PT32_ROOT_LEVEL)
paging32_update_pte(vcpu, page, spte, new, bytes);
else
paging64_update_pte(vcpu, page, spte, new, bytes);
}
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *old, const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *page;
......@@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
unsigned pte_size;
unsigned page_offset;
unsigned misaligned;
unsigned quadrant;
int level;
int flooded = 0;
int npte;
......@@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
continue;
pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) {
/*
* Misaligned accesses are too much trouble to fix
......@@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
page_offset <<= 1;
npte = 2;
}
quadrant = page_offset >> PAGE_SHIFT;
page_offset &= ~PAGE_MASK;
if (quadrant != page->role.quadrant)
continue;
}
spte = __va(page->page_hpa);
spte += page_offset / sizeof(*spte);
spte = &page->spt[page_offset / sizeof(*spte)];
while (npte--) {
mmu_pre_write_zap_pte(vcpu, page, spte);
mmu_pte_write_zap_pte(vcpu, page, spte);
mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
++spte;
}
}
}
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
{
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
......@@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu, page);
}
while (!list_empty(&vcpu->free_pages)) {
page = list_entry(vcpu->free_pages.next,
struct kvm_mmu_page, link);
list_del(&page->link);
__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
page->page_hpa = INVALID_PAGE;
}
free_page((unsigned long)vcpu->mmu.pae_root);
}
......@@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
ASSERT(vcpu);
for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
INIT_LIST_HEAD(&page_header->link);
if ((page = alloc_page(GFP_KERNEL)) == NULL)
goto error_1;
set_page_private(page, (unsigned long)page_header);
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
++vcpu->kvm->n_free_mmu_pages;
}
vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
......@@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(list_empty(&vcpu->free_pages));
return alloc_mmu_pages(vcpu);
}
......@@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!list_empty(&vcpu->free_pages));
return init_kvm_mmu(vcpu);
}
......@@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
if (!test_bit(slot, &page->slot_bitmap))
continue;
pt = __va(page->page_hpa);
pt = page->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) {
......@@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
}
mmu_free_memory_caches(vcpu);
kvm_arch_ops->tlb_flush(vcpu);
kvm_flush_remote_tlbs(vcpu->kvm);
init_kvm_mmu(vcpu);
}
......@@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void)
kmem_cache_destroy(pte_chain_cache);
if (rmap_desc_cache)
kmem_cache_destroy(rmap_desc_cache);
if (mmu_page_cache)
kmem_cache_destroy(mmu_page_cache);
if (mmu_page_header_cache)
kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
......@@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void)
if (!rmap_desc_cache)
goto nomem;
mmu_page_cache = kmem_cache_create("kvm_mmu_page",
PAGE_SIZE,
PAGE_SIZE, 0, NULL, NULL);
if (!mmu_page_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
0, 0, NULL, NULL);
if (!mmu_page_header_cache)
goto nomem;
return 0;
nomem:
......@@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
int i;
list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
u64 *pt = __va(page->page_hpa);
u64 *pt = page->spt;
if (page->role.level != PT_PAGE_TABLE_LEVEL)
continue;
......
......@@ -31,7 +31,6 @@
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#else
......@@ -46,7 +45,6 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
#define PT_MAX_FULL_LEVELS 2
#else
#error Invalid PTTYPE value
......@@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
}
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
u64 *shadow_pte, u64 access_bits, gfn_t gfn)
static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
u64 *shadow_pte,
gpa_t gaddr,
pt_element_t *gpte,
u64 access_bits,
int user_fault,
int write_fault,
int *ptwrite,
struct guest_walker *walker,
gfn_t gfn)
{
ASSERT(*shadow_pte == 0);
access_bits &= guest_pte;
*shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
guest_pte & PT_DIRTY_MASK, access_bits, gfn);
hpa_t paddr;
int dirty = *gpte & PT_DIRTY_MASK;
u64 spte = *shadow_pte;
int was_rmapped = is_rmap_pte(spte);
pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, spte, (u64)*gpte, access_bits,
write_fault, user_fault, gfn);
if (write_fault && !dirty) {
*gpte |= PT_DIRTY_MASK;
dirty = 1;
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
}
spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
spte |= *gpte & PT64_NX_MASK;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
spte |= PT_PRESENT_MASK;
if (access_bits & PT_USER_MASK)
spte |= PT_USER_MASK;
if (is_error_hpa(paddr)) {
spte |= gaddr;
spte |= PT_SHADOW_IO_MARK;
spte &= ~PT_PRESENT_MASK;
set_shadow_pte(shadow_pte, spte);
return;
}
spte |= paddr;
if ((access_bits & PT_WRITABLE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
kvm_arch_ops->tlb_flush(vcpu);
}
if (write_fault)
*ptwrite = 1;
}
}
unshadowed:
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
if (!was_rmapped)
rmap_add(vcpu, shadow_pte);
}
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
u64 *shadow_pte, u64 access_bits, gfn_t gfn)
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte,
u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{
access_bits &= *gpte;
FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK,
gpte, access_bits, user_fault, write_fault,
ptwrite, walker, gfn);
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
u64 *spte, const void *pte, int bytes)
{
pt_element_t gpte;
if (bytes < sizeof(pt_element_t))
return;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
return;
pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
0, NULL, NULL,
(gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
}
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde,
u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{
gpa_t gaddr;
ASSERT(*shadow_pte == 0);
access_bits &= guest_pde;
access_bits &= *gpde;
gaddr = (gpa_t)gfn << PAGE_SHIFT;
if (PTTYPE == 32 && is_cpuid_PSE36())
gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
gaddr |= (*gpde & PT32_DIR_PSE36_MASK) <<
(32 - PT32_DIR_PSE36_SHIFT);
*shadow_pte = guest_pde & PT_PTE_COPY_MASK;
set_pte_common(vcpu, shadow_pte, gaddr,
guest_pde & PT_DIRTY_MASK, access_bits, gfn);
FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
gpde, access_bits, user_fault, write_fault,
ptwrite, walker, gfn);
}
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker)
struct guest_walker *walker,
int user_fault, int write_fault, int *ptwrite)
{
hpa_t shadow_addr;
int level;
u64 *shadow_ent;
u64 *prev_shadow_ent = NULL;
pt_element_t *guest_ent = walker->ptep;
......@@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
struct kvm_mmu_page *shadow_page;
u64 shadow_pte;
int metaphysical;
gfn_t table_gfn;
unsigned hugepage_access = 0;
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
if (level == PT_PAGE_TABLE_LEVEL)
return shadow_ent;
break;
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
prev_shadow_ent = shadow_ent;
continue;
}
if (level == PT_PAGE_TABLE_LEVEL) {
if (walker->level == PT_DIRECTORY_LEVEL) {
if (prev_shadow_ent)
*prev_shadow_ent |= PT_SHADOW_PS_MARK;
FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
walker->inherited_ar,
walker->gfn);
} else {
ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
walker->inherited_ar,
walker->gfn);
}
return shadow_ent;
}
if (level == PT_PAGE_TABLE_LEVEL)
break;
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
......@@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, hugepage_access,
shadow_ent);
shadow_addr = shadow_page->page_hpa;
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
*shadow_ent = shadow_pte;
prev_shadow_ent = shadow_ent;
}
}
/*
* The guest faulted for write. We need to
*
* - check write permissions
* - update the guest pte dirty bit
* - update our own dirty page tracking structures
*/
static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
u64 *shadow_ent,
struct guest_walker *walker,
gva_t addr,
int user,
int *write_pt)
{
pt_element_t *guest_ent;
int writable_shadow;
gfn_t gfn;
struct kvm_mmu_page *page;
if (is_writeble_pte(*shadow_ent))
return !user || (*shadow_ent & PT_USER_MASK);
writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
if (user) {
/*
* User mode access. Fail if it's a kernel page or a read-only
* page.
*/
if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
return 0;
ASSERT(*shadow_ent & PT_USER_MASK);
} else
/*
* Kernel mode access. Fail if it's a read-only page and
* supervisor write protection is enabled.
*/
if (!writable_shadow) {
if (is_write_protection(vcpu))
return 0;
*shadow_ent &= ~PT_USER_MASK;
}
guest_ent = walker->ptep;
if (!is_present_pte(*guest_ent)) {
*shadow_ent = 0;
return 0;
if (walker->level == PT_DIRECTORY_LEVEL) {
FNAME(set_pde)(vcpu, guest_ent, shadow_ent,
walker->inherited_ar, user_fault, write_fault,
ptwrite, walker, walker->gfn);
} else {
ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
FNAME(set_pte)(vcpu, guest_ent, shadow_ent,
walker->inherited_ar, user_fault, write_fault,
ptwrite, walker, walker->gfn);
}
gfn = walker->gfn;
if (user) {
/*
* Usermode page faults won't be for page table updates.
*/
while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n",
__FUNCTION__, gfn, page->role.word);
kvm_mmu_zap_page(vcpu, page);
}
} else if (kvm_mmu_lookup_page(vcpu, gfn)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
mark_page_dirty(vcpu->kvm, gfn);
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
*write_pt = 1;
return 0;
}
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
rmap_add(vcpu, shadow_ent);
return 1;
return shadow_ent;
}
/*
......@@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
u64 *shadow_pte;
int fixed;
int write_pt = 0;
int r;
......@@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
FNAME(release_walker)(&walker);
vcpu->last_pt_write_count = 0; /* reset fork detector */
return 0;
}
shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
shadow_pte, *shadow_pte);
/*
* Update the shadow pte.
*/
if (write_fault)
fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
user_fault, &write_pt);
else
fixed = fix_read_pf(shadow_pte);
pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
shadow_pte, *shadow_pte);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
shadow_pte, *shadow_pte, write_pt);
FNAME(release_walker)(&walker);
if (!write_pt)
vcpu->last_pt_write_count = 0; /* reset fork detector */
/*
* mmio: emulate if accessible, otherwise its a guest fault.
*/
......@@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_MAX_FULL_LEVELS
......@@ -14,16 +14,17 @@
*
*/
#include "kvm_svm.h"
#include "x86_emulate.h"
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <asm/desc.h>
#include "kvm_svm.h"
#include "x86_emulate.h"
#include <asm/desc.h>
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
......@@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void)
int cpu;
struct page *iopm_pages;
struct page *msrpm_pages;
void *msrpm_va;
void *iopm_va, *msrpm_va;
int r;
kvm_emulator_want_group7_invlpg();
......@@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void)
if (!iopm_pages)
return -ENOMEM;
memset(page_address(iopm_pages), 0xff,
PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
......@@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
goto out2;
vcpu->svm->vmcb = page_address(page);
memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
clear_page(vcpu->svm->vmcb);
vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
vcpu->svm->asid_generation = 0;
memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
......@@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->fpu_active = 1;
vcpu->apic_base = 0xfee00000 |
/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
MSR_IA32_APICBASE_ENABLE;
vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vcpu == &vcpu->kvm->vcpus[0])
vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
return 0;
......@@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
* VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it.
*/
memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
clear_page(vcpu->svm->vmcb);
init_vmcb(vcpu->svm->vmcb);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
......@@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
skip_emulated_instruction(vcpu);
if (vcpu->irq_summary)
return 1;
kvm_run->exit_reason = KVM_EXIT_HLT;
++vcpu->stat.halt_exits;
return 0;
return kvm_emulate_halt(vcpu);
}
static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
......@@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs)
asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
}
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
force_new_asid(vcpu);
}
static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u16 fs_selector;
......@@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
again:
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
return r;
if (!vcpu->mmio_read_completed)
do_interrupt_requests(vcpu, kvm_run);
clgi();
vcpu->guest_mode = 1;
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
svm_flush_tlb(vcpu);
pre_svm_run(vcpu);
save_host_msrs(vcpu);
......@@ -1617,6 +1629,8 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
: "cc", "memory" );
vcpu->guest_mode = 0;
if (vcpu->fpu_active) {
fx_save(vcpu->guest_fx_image);
fx_restore(vcpu->host_fx_image);
......@@ -1681,11 +1695,6 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return r;
}
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
force_new_asid(vcpu);
}
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
vcpu->svm->vmcb->save.cr3 = root;
......@@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
static int is_disabled(void)
{
u64 vm_cr;
rdmsrl(MSR_VM_CR, vm_cr);
if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
return 1;
return 0;
}
......
......@@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CPUID_FUNC 0x8000000a
#define MSR_EFER_SVME_MASK (1ULL << 12)
#define MSR_VM_CR 0xc0010114
#define MSR_VM_HSAVE_PA 0xc0010117ULL
#define SVM_VM_CR_SVM_DISABLE 4
#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_DPL_SHIFT 5
#define SVM_SELECTOR_P_SHIFT 7
......
此差异已折叠。
......@@ -98,8 +98,11 @@ static u8 opcode_table[256] = {
0, 0, 0, 0,
/* 0x40 - 0x4F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50 - 0x5F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50 - 0x57 */
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x58 - 0x5F */
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0x60 - 0x6F */
0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
......@@ -128,9 +131,9 @@ static u8 opcode_table[256] = {
/* 0xB0 - 0xBF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0,
0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov,
DstMem | SrcImm | ModRM | Mov,
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, ImplicitOps, 0, 0,
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xD7 */
......@@ -143,7 +146,8 @@ static u8 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
ImplicitOps, 0,
ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
/* 0xF8 - 0xFF */
0, 0, 0, 0,
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
......@@ -152,7 +156,7 @@ static u8 opcode_table[256] = {
static u16 twobyte_table[256] = {
/* 0x00 - 0x0F */
0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
/* 0x20 - 0x2F */
......@@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int mode = ctxt->mode;
unsigned long modrm_ea;
int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
int no_wb = 0;
/* Shadow copy of register state. Committed on successful emulation. */
unsigned long _regs[NR_VCPU_REGS];
......@@ -1047,7 +1052,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
_regs[VCPU_REGS_RSP]),
&dst.val, dst.bytes, ctxt)) != 0)
goto done;
dst.val = dst.orig_val; /* skanky: disable writeback */
no_wb = 1;
break;
default:
goto cannot_emulate;
......@@ -1056,7 +1061,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
}
writeback:
if ((d & Mov) || (dst.orig_val != dst.val)) {
if (!no_wb) {
switch (dst.type) {
case OP_REG:
/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
......@@ -1149,6 +1154,23 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
case 0xf4: /* hlt */
ctxt->vcpu->halt_request = 1;
goto done;
case 0xc3: /* ret */
dst.ptr = &_eip;
goto pop_instruction;
case 0x58 ... 0x5f: /* pop reg */
dst.ptr = (unsigned long *)&_regs[b & 0x7];
pop_instruction:
if ((rc = ops->read_std(register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
no_wb = 1; /* Disable writeback. */
break;
}
goto writeback;
......@@ -1302,8 +1324,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
twobyte_special_insn:
/* Disable writeback. */
dst.orig_val = dst.val;
no_wb = 1;
switch (b) {
case 0x09: /* wbinvd */
break;
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
break;
......
......@@ -139,6 +139,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
put_filp(file);
return error;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/*
* A single inode exists for all anon_inode files. Contrary to pipes,
......
......@@ -13,7 +13,6 @@
#define HPFS_SUPER_MAGIC 0xf995e849
#define ISOFS_SUPER_MAGIC 0x9660
#define JFFS2_SUPER_MAGIC 0x72b6
#define KVMFS_SUPER_MAGIC 0x19700426
#define ANON_INODE_FS_MAGIC 0x09041934
#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */
......
......@@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task,
* not handling interrupts, soon dead */
/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
* operation in progress
......@@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */
......@@ -7,6 +7,7 @@
*/
#include <linux/errno.h>
#include <asm/system.h>
extern void cpu_idle(void);
......@@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { }
static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
void *info, int retry, int wait)
{
return -EBUSY;
WARN_ON(cpuid != 0);
local_irq_disable();
func(info);
local_irq_enable();
return 0;
}
#endif /* !SMP */
......
......@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
write_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
unsigned long mod;
void *hcpu;
};
/* Take this CPU down. */
static int take_cpu_down(void *unused)
static int take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
int err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
......@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
cpumask_t old_allowed, tmp;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.mod = mod,
.hcpu = hcpu,
};
if (num_online_cpus() == 1)
return -EBUSY;
......@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
set_cpus_allowed(current, tmp);
mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, NULL, cpu);
p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
mutex_unlock(&cpu_bitmask_lock);
if (IS_ERR(p) || cpu_online(cpu)) {
......
......@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
static int cpuset_handle_cpuhp(struct notifier_block *nb,
unsigned long phase, void *cpu)
{
if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
return NOTIFY_DONE;
common_cpu_mem_hotplug_unplug();
return 0;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册