提交 e9bda6f6 编写于 作者: A Avi Kivity

Merge branch 'queue' into next

Merge patches queued during the run-up to the merge window.

* queue: (25 commits)
  KVM: Choose better candidate for directed yield
  KVM: Note down when cpu relax intercepted or pause loop exited
  KVM: Add config to support ple or cpu relax optimzation
  KVM: switch to symbolic name for irq_states size
  KVM: x86: Fix typos in pmu.c
  KVM: x86: Fix typos in lapic.c
  KVM: x86: Fix typos in cpuid.c
  KVM: x86: Fix typos in emulate.c
  KVM: x86: Fix typos in x86.c
  KVM: SVM: Fix typos
  KVM: VMX: Fix typos
  KVM: remove the unused parameter of gfn_to_pfn_memslot
  KVM: remove is_error_hpa
  KVM: make bad_pfn static to kvm_main.c
  KVM: using get_fault_pfn to get the fault pfn
  KVM: MMU: track the refcount when unmap the page
  KVM: x86: remove unnecessary mark_page_dirty
  KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()
  KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
  KVM: MMU: Add memslot parameter to hva handlers
  ...
Signed-off-by: Avi Kivity <avi@redhat.com>
......@@ -52,6 +52,8 @@
struct kvm;
extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
......
......@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn))
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
unsigned long gfn))
{
int ret;
int retval = 0;
......@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
unsigned long start = memslot->userspace_addr;
unsigned long end;
unsigned long hva_start, hva_end;
gfn_t gfn, gfn_end;
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
hva_start = max(start, memslot->userspace_addr);
hva_end = min(end, memslot->userspace_addr +
(memslot->npages << PAGE_SHIFT));
if (hva_start >= hva_end)
continue;
/*
* {gfn(page) | page intersects with [hva_start, hva_end)} =
* {gfn, gfn+1, ..., gfn_end-1}.
*/
gfn = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
for (; gfn < gfn_end; ++gfn) {
gfn_t gfn_offset = gfn - memslot->base_gfn;
ret = handler(kvm, &memslot->rmap[gfn_offset],
memslot->base_gfn + gfn_offset);
ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
retval |= ret;
}
}
......@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return retval;
}
/*
 * Single-address convenience wrapper around kvm_handle_hva_range():
 * @hva is expressed as the half-open range [hva, hva + 1) and the
 * range helper's result is returned unchanged.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	unsigned long range_end = hva + 1;

	return kvm_handle_hva_range(kvm, hva, range_end, handler);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
......@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0;
}
/*
 * Apply kvm_unmap_rmapp() to the host virtual address range
 * [start, end), but only when this VM uses MMU notifiers.
 * The handler result is discarded; this always returns 0.
 */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;

	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
......
......@@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn);
......
......@@ -21,6 +21,7 @@ config KVM
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
......
......@@ -500,11 +500,11 @@ struct kvm_vcpu_arch {
};
struct kvm_lpage_info {
unsigned long rmap_pde;
int write_count;
};
struct kvm_arch_memory_slot {
unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
......@@ -957,6 +957,7 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
......
......@@ -37,6 +37,7 @@ config KVM
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
......
......@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capbability word 9 */
/* Mask ebx against host capability word 9 */
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
......
......@@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
} else {
/* exapand-down segment */
/* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
......@@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
err_code = selector & 0xfffc;
err_vec = GP_VECTOR;
/* can't load system descriptor into segment selecor */
/* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception;
......@@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
* Now load segment descriptors. If fault happenes at this stage
* Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
......@@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
* 3. jmp/call to TSS: Check agains DPL of the TSS
* 3. jmp/call to TSS: Check against DPL of the TSS
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
......@@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
note that old_tss_sel is not used afetr this point */
note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
......
......@@ -70,7 +70,7 @@ struct kvm_pic {
struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[16];
unsigned long irq_states[PIC_NUM_PINS];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
......
......@@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
{
unsigned char alignment = offset & 0xf;
u32 result;
/* this bitmask has a bit cleared for each reserver register */
/* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
......@@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
/* lapic timer in oneshot or peroidic mode */
/* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time();
apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
......
......@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
return 0;
pfn = spte_to_pfn(old_spte);
/*
* KVM does not hold the refcount of the page used by
* kvm mmu, before reclaiming the page, we should
* unmap it from mmu first.
*/
WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
......@@ -960,13 +968,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
struct kvm_lpage_info *linfo;
unsigned long idx;
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
linfo = lpage_info_slot(gfn, slot, level);
return &linfo->rmap_pde;
idx = gfn_to_index(gfn, slot->base_gfn, level);
return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx];
}
/*
......@@ -1200,7 +1208,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -1218,7 +1226,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -1259,43 +1267,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
unsigned long data))
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
unsigned long data,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
struct kvm_memory_slot *slot,
unsigned long data))
{
int j;
int ret;
int retval = 0;
int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
unsigned long start = memslot->userspace_addr;
unsigned long end;
unsigned long hva_start, hva_end;
gfn_t gfn_start, gfn_end;
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
gfn_t gfn = memslot->base_gfn + gfn_offset;
hva_start = max(start, memslot->userspace_addr);
hva_end = min(end, memslot->userspace_addr +
(memslot->npages << PAGE_SHIFT));
if (hva_start >= hva_end)
continue;
/*
* {gfn(page) | page intersects with [hva_start, hva_end)} =
* {gfn_start, gfn_start+1, ..., gfn_end-1}.
*/
gfn_start = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = PT_PAGE_TABLE_LEVEL;
j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long idx, idx_end;
unsigned long *rmapp;
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
struct kvm_lpage_info *linfo;
/*
* {idx(page_j) | page_j intersects with
* [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
*/
idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
linfo = lpage_info_slot(gfn, memslot,
PT_DIRECTORY_LEVEL + j);
ret |= handler(kvm, &linfo->rmap_pde, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
rmapp = __gfn_to_rmap(gfn_start, j, memslot);
for (; idx <= idx_end; ++idx)
ret |= handler(kvm, rmapp++, memslot, data);
}
}
return retval;
return ret;
}
/*
 * Single-address convenience wrapper around kvm_handle_hva_range():
 * forwards @data and @handler for the half-open range [hva, hva + 1)
 * and returns the range helper's result unchanged.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 struct kvm_memory_slot *slot,
					 unsigned long data))
{
	unsigned long range_end = hva + 1;

	return kvm_handle_hva_range(kvm, hva, range_end, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
......@@ -1303,13 +1335,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
/*
 * Run kvm_unmap_rmapp() over the host virtual address range
 * [start, end) with no extra handler data, and hand back the
 * accumulated handler result.
 */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	int ret;

	ret = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
	return ret;
}
/*
 * Invoke kvm_set_pte_rmapp() for @hva. The address of the local @pte
 * copy is passed through the handler's opaque data word, so it is only
 * valid for the duration of this call.
 */
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long data = (unsigned long)&pte;

	kvm_handle_hva(kvm, hva, data, kvm_set_pte_rmapp);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
......@@ -1323,8 +1360,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
if (!shadow_accessed_mask)
return kvm_unmap_rmapp(kvm, rmapp, data);
if (!shadow_accessed_mask) {
young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
goto out;
}
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
......@@ -1336,12 +1375,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep);
}
}
out:
/* @data has hva passed to kvm_age_hva(). */
trace_kvm_age_page(data, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -1379,13 +1420,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
......@@ -2472,14 +2513,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
get_page(fault_page);
return page_to_pfn(fault_page);
}
if (!slot)
return get_fault_pfn();
hva = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn_atomic(vcpu->kvm, hva);
return hva_to_pfn_atomic(hva);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
......
/*
* Kernel-based Virtual Machine -- Performane Monitoring Unit support
* Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
*
......
......@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
* code path runs with irqs and preemtion disabled. A
* code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
......@@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
* This function merges the msr permission bitmaps of kvm and the
* nested vmcb. It is omptimized in that it only merges the parts where
* nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
......
......@@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer = vmx->vcpu.arch.efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
* NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
......@@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
* is setting it to 0 in the usedland code. This causes invalid guest
* is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
......@@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
/* called to set cr0 as approriate for a mov-to-cr0 exit. */
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
......
......@@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
* exact software computaion in compute_guest_tsc()
* exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
......@@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
/* Bits 2:5 are resrved, Should be zero */
/* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c)
return 1;
......@@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
* AMD for these chips. It is possible to speicify the
* AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
......@@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
if (!vcpu->arch.time_page)
return -EINVAL;
src->flags |= PVCLOCK_GUEST_STOPPED;
mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
......@@ -4492,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
/*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-entetr the
* and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
......@@ -5588,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
* back from emulation context to vcpu. Usrapace shouldn't do
* back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
......@@ -6117,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage)
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
* Platforms with unnreliable TSCs don't have to deal with this, they
* Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
......@@ -6314,6 +6313,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) {
kvm_kvfree(free->arch.rmap_pde[i]);
free->arch.rmap_pde[i] = NULL;
}
if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
kvm_kvfree(free->arch.lpage_info[i]);
free->arch.lpage_info[i] = NULL;
......@@ -6333,6 +6336,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
slot->arch.rmap_pde[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i]));
if (!slot->arch.rmap_pde[i])
goto out_free;
slot->arch.lpage_info[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
if (!slot->arch.lpage_info[i])
......@@ -6361,7 +6369,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
kvm_kvfree(slot->arch.rmap_pde[i]);
kvm_kvfree(slot->arch.lpage_info[i]);
slot->arch.rmap_pde[i] = NULL;
slot->arch.lpage_info[i] = NULL;
}
return -ENOMEM;
......@@ -6381,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
*x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
if (npages && !old.rmap) {
......
......@@ -183,6 +183,18 @@ struct kvm_vcpu {
} async_pf;
#endif
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
* in_spin_loop: set when a vcpu does a pause loop exit
* or cpu relax intercepted.
* dy_eligible: indicates whether vcpu is eligible for directed yield.
*/
struct {
bool in_spin_loop;
bool dy_eligible;
} spin_loop;
#endif
struct kvm_vcpu_arch arch;
};
......@@ -378,20 +390,11 @@ id_to_memslot(struct kvm_memslots *slots, int id)
return slot;
}
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
extern struct page *bad_page;
extern struct page *fault_page;
extern pfn_t bad_pfn;
extern pfn_t fault_pfn;
int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn);
int is_hwpoison_pfn(pfn_t pfn);
int is_fault_pfn(pfn_t pfn);
int is_noslot_pfn(pfn_t pfn);
int is_invalid_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr);
......@@ -427,20 +430,20 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t hva_to_pfn_atomic(unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn);
void kvm_get_pfn(pfn_t pfn);
pfn_t get_fault_pfn(void);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
......@@ -740,6 +743,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
static inline gfn_t
hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
return slot->base_gfn + gfn_offset;
}
static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn)
{
......@@ -899,5 +910,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
}
}
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
 * Store @val in @vcpu's spin_loop.in_spin_loop flag (the flag that marks a
 * vcpu as having done a pause loop exit / cpu relax intercept).
 */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
vcpu->spin_loop.in_spin_loop = val;
}
/*
 * Store @val in @vcpu's spin_loop.dy_eligible flag (whether the vcpu is
 * eligible for directed yield).
 */
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
vcpu->spin_loop.dy_eligible = val;
}
#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
/*
 * Stub used when CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT is not set:
 * there is no spin_loop state to update, so this does nothing.
 */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
}
/*
 * Stub used when CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT is not set:
 * there is no spin_loop state to update, so this does nothing.
 */
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
}
/*
 * Stub used when CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT is not set:
 * no eligibility tracking is available, so every vcpu is reported
 * as eligible for directed yield.
 */
static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
return true;
}
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
......@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
config HAVE_KVM_MSI
bool
config HAVE_KVM_CPU_RELAX_INTERCEPT
bool
......@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long size)
static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
......@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
return pfn;
while (gfn < end_gfn)
gfn_to_pfn_memslot(kvm, slot, gfn++);
gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
......@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
......
......@@ -321,11 +321,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
max_pin = 16;
max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
max_pin = 16;
max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
......
......@@ -100,11 +100,14 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
struct page *bad_page;
static pfn_t bad_pfn;
static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;
struct page *fault_page;
pfn_t fault_pfn;
static struct page *fault_page;
static pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
......@@ -236,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
}
vcpu->run = page_address(page);
kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
......@@ -332,8 +338,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
......@@ -950,12 +955,6 @@ int is_hwpoison_pfn(pfn_t pfn)
}
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
int is_fault_pfn(pfn_t pfn)
{
return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);
int is_noslot_pfn(pfn_t pfn)
{
return pfn == bad_pfn;
......@@ -1039,11 +1038,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t get_fault_pfn(void)
pfn_t get_fault_pfn(void)
{
get_page(fault_page);
return fault_pfn;
}
EXPORT_SYMBOL_GPL(get_fault_pfn);
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int write, struct page **page)
......@@ -1065,8 +1065,8 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON;
}
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
bool write_fault, bool *writable)
{
struct page *page[1];
int npages = 0;
......@@ -1146,9 +1146,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
return pfn;
}
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
pfn_t hva_to_pfn_atomic(unsigned long addr)
{
return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
return hva_to_pfn(addr, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
......@@ -1166,7 +1166,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
return page_to_pfn(bad_page);
}
return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
return hva_to_pfn(addr, atomic, async, write_fault, writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
......@@ -1195,11 +1195,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
return hva_to_pfn(addr, false, NULL, true, NULL);
}
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
......@@ -1580,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Helper that checks whether a VCPU is eligible for directed yield.
* Most eligible candidate to yield is decided by following heuristics:
*
* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
* (preempted lock holder), indicated by @in_spin_loop.
* Set at the beginning and cleared at the end of interception/PLE handler.
*
* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
* chance last time (mostly it has become eligible now since we have probably
* yielded to lockholder in last iteration. This is done by toggling
* @dy_eligible each time a VCPU checked for eligibility.)
*
* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
* to preempted lock-holder could result in wrong VCPU selection and CPU
* burning. Giving priority for a potential lock-holder increases lock
* progress.
*
* Since algorithm is based on heuristics, accessing another VCPU data without
* locking does not harm. It may result in trying to yield to same VCPU, fail
* and continue with next VCPU and so on.
*/
/*
 * Decide whether @vcpu may be chosen as a directed-yield target.
 *
 * A vcpu that is not in a spin loop is always eligible. A vcpu that is
 * in a spin loop is eligible only if its dy_eligible flag is currently
 * set, and that flag is flipped on every such check so the answer
 * alternates between consecutive calls.
 */
bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
	bool eligible = true;

	if (vcpu->spin_loop.in_spin_loop) {
		eligible = vcpu->spin_loop.dy_eligible;
		/* Toggle so a spinning vcpu passes every other check. */
		kvm_vcpu_set_dy_eligible(vcpu, !eligible);
	}

	return eligible;
}
#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
......@@ -1589,6 +1625,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
......@@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
......@@ -1614,6 +1653,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
}
kvm_vcpu_set_in_spin_loop(me, false);
/* Ensure vcpu is not eligible during next spinloop */
kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
......@@ -2697,9 +2740,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
};
struct page *bad_page;
pfn_t bad_pfn;
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册