提交 d94d1400 编写于 作者: L Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Paolo Bonzini:
 "x86:

   - take into account HVA before retrying on MMU notifier race

   - fixes for nested AMD guests without NPT

   - allow INVPCID in guest without PCID

   - disable PML in hardware when not in use

   - MMU code cleanups:

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  KVM: SVM: Fix nested VM-Exit on #GP interception handling
  KVM: vmx/pmu: Fix dummy check if lbr_desc->event is created
  KVM: x86/mmu: Consider the hva in mmu_notifier retry
  KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault
  KVM: Documentation: rectify rst markup in KVM_GET_SUPPORTED_HV_CPUID
  KVM: nSVM: prepare guest save area while is_guest_mode is true
  KVM: x86/mmu: Remove a variety of unnecessary exports
  KVM: x86: Fold "write-protect large" use case into generic write-protect
  KVM: x86/mmu: Don't set dirty bits when disabling dirty logging w/ PML
  KVM: VMX: Dynamically enable/disable PML based on memslot dirty logging
  KVM: x86: Further clarify the logic and comments for toggling log dirty
  KVM: x86: Move MMU's PML logic to common code
  KVM: x86/mmu: Make dirty log size hook (PML) a value, not a function
  KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect()
  KVM: nVMX: Disable PML in hardware when running L2
  KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs
  KVM: x86/mmu: Pass the memslot to the rmap callbacks
  KVM: x86/mmu: Split out max mapping level calculation to helper
  KVM: x86/mmu: Expand collapsible SPTE zap for TDP MMU to ZONE_DEVICE and HugeTLB pages
  KVM: nVMX: no need to undo inject_page_fault change on nested vmexit
  ...
......@@ -4519,6 +4519,7 @@ KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
leaves (0x40000000, 0x40000001).
Currently, the following list of CPUID leaves are returned:
- HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
- HYPERV_CPUID_INTERFACE
- HYPERV_CPUID_VERSION
......@@ -4543,6 +4544,7 @@ userspace should not expect to get any particular value there.
Note, vcpu version of KVM_GET_SUPPORTED_HV_CPUID is currently deprecated. Unlike
system ioctl which exposes all supported feature bits unconditionally, vcpu
version has the following quirks:
- HYPERV_CPUID_NESTED_FEATURES leaf and HV_X64_ENLIGHTENED_VMCS_RECOMMENDED
feature bit are only exposed when Enlightened VMCS was previously enabled
on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
......
......@@ -591,7 +591,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
} else {
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
writing, &write_ok);
writing, &write_ok, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
......
......@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
writing, upgrade_p);
writing, upgrade_p, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
......
......@@ -93,11 +93,7 @@ KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP_NULL(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_NULL(slot_enable_log_dirty)
KVM_X86_OP_NULL(slot_disable_log_dirty)
KVM_X86_OP_NULL(flush_log_dirty)
KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
KVM_X86_OP_NULL(cpu_dirty_log_size)
KVM_X86_OP_NULL(update_cpu_dirty_logging)
KVM_X86_OP_NULL(pre_block)
KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
......
......@@ -89,6 +89,8 @@
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
......@@ -1007,6 +1009,7 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
int cpu_dirty_logging_count;
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
......@@ -1271,30 +1274,11 @@ struct kvm_x86_ops {
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
/*
* Arch-specific dirty logging hooks. These hooks are only supposed to
* be valid if the specific arch has hardware-accelerated dirty logging
* mechanism. Currently only for PML on VMX.
*
* - slot_enable_log_dirty:
* called when enabling log dirty mode for the slot.
* - slot_disable_log_dirty:
* called when disabling log dirty mode for the slot.
* also called when slot is created with log dirty disabled.
* - flush_log_dirty:
* called before reporting dirty_bitmap to userspace.
* - enable_log_dirty_pt_masked:
* called when reenabling log dirty for the GFNs in the mask after
* corresponding bits are cleared in slot->dirty_bitmap.
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.
*/
void (*slot_enable_log_dirty)(struct kvm *kvm,
struct kvm_memory_slot *slot);
void (*slot_disable_log_dirty)(struct kvm *kvm,
struct kvm_memory_slot *slot);
void (*flush_log_dirty)(struct kvm *kvm);
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
int (*cpu_dirty_log_size)(void);
int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
......@@ -1437,11 +1421,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
......@@ -1613,7 +1592,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
......
......@@ -408,7 +408,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_7_0_EBX,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
......
......@@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
* - W bit on ad-disabled SPTEs.
* Returns true iff any D or W bits were cleared.
*/
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -1180,35 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
return flush;
}
static bool spte_set_dirty(u64 *sptep)
{
u64 spte = *sptep;
rmap_printk("spte %p %llx\n", sptep, *sptep);
/*
* Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
* do not bother adding back write access to pages marked
* SPTE_AD_WRPROT_ONLY_MASK.
*/
spte |= shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
}
static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
if (spte_ad_enabled(*sptep))
flush |= spte_set_dirty(sptep);
return flush;
}
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
......@@ -1248,9 +1220,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
*
* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
*/
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
struct kvm_rmap_head *rmap_head;
......@@ -1260,13 +1232,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
__rmap_clear_dirty(kvm, rmap_head);
__rmap_clear_dirty(kvm, rmap_head, slot);
/* clear the first set bit */
mask &= mask - 1;
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
/**
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
......@@ -1282,20 +1253,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
if (kvm_x86_ops.enable_log_dirty_pt_masked)
static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
gfn_offset,
mask);
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
int kvm_cpu_dirty_log_size(void)
{
if (kvm_x86_ops.cpu_dirty_log_size)
return static_call(kvm_x86_cpu_dirty_log_size)();
return 0;
return kvm_x86_ops.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
......@@ -1325,7 +1291,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -1345,7 +1312,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
return kvm_zap_rmapp(kvm, rmap_head);
return kvm_zap_rmapp(kvm, rmap_head, slot);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
......@@ -2499,7 +2466,21 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
int r;
if (vcpu->arch.mmu->direct_map)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
return r;
}
static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
......@@ -2753,11 +2734,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
if (sp->role.level > PG_LEVEL_4K)
return;
/*
* If addresses are being invalidated, skip prefetching to avoid
* accidentally prefetching those addresses.
*/
if (unlikely(vcpu->kvm->mmu_notifier_count))
return;
__direct_pte_prefetch(vcpu, sp, sptep);
}
static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_pfn_t pfn, struct kvm_memory_slot *slot)
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
......@@ -2776,19 +2764,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
pte = lookup_address_in_mm(kvm->mm, hva, &level);
if (unlikely(!pte))
return PG_LEVEL_4K;
return level;
}
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
linfo = lpage_info_slot(gfn, slot, max_level);
if (!linfo->disallow_lpage)
break;
}
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
return host_pfn_mapping_level(kvm, gfn, pfn, slot);
}
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
kvm_pfn_t pfn = *pfnp;
kvm_pfn_t mask;
int level;
......@@ -2805,17 +2810,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
linfo = lpage_info_slot(gfn, slot, max_level);
if (!linfo->disallow_lpage)
break;
}
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
if (level == PG_LEVEL_4K)
return level;
......@@ -3437,7 +3432,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
......@@ -3653,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
bool *writable)
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
bool write, bool *writable)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
......@@ -3667,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
async = false;
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
write, writable, hva);
if (!async)
return false; /* *pfn has correct page already */
......@@ -3681,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
write, writable, hva);
return false;
}
......@@ -3694,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
kvm_pfn_t pfn;
hva_t hva;
int r;
if (page_fault_handle_page_track(vcpu, error_code, gfn))
......@@ -3712,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
write, &map_writable))
return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
......@@ -3725,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
else
write_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
......@@ -5003,22 +5001,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
write_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
int r;
if (vcpu->arch.mmu->direct_map)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
......@@ -5117,7 +5099,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->invlpg(vcpu, gva, root_hpa);
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
......@@ -5157,7 +5138,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
* for them.
*/
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
int tdp_huge_page_level)
......@@ -5182,7 +5162,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
......@@ -5196,7 +5177,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
flush |= fn(kvm, iterator.rmap);
flush |= fn(kvm, iterator.rmap, memslot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
......@@ -5229,22 +5210,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
lock_flush_tlb);
}
static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}
static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}
static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
......@@ -5485,7 +5450,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot)
{
return __rmap_write_protect(kvm, rmap_head, false);
}
......@@ -5519,7 +5485,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
......@@ -5540,8 +5507,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
(kvm_is_zone_device_pfn(pfn) ||
PageCompound(pfn_to_page(pfn)))) {
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
pfn, PG_LEVEL_NUM)) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
......@@ -5561,12 +5528,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
write_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
kvm_mmu_zap_collapsible_spte, true);
slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
......@@ -5605,40 +5573,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
bool flush;
write_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
false);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
bool flush;
write_lock(&kvm->mmu_lock);
flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
void kvm_mmu_zap_all(struct kvm *kvm)
{
......
......@@ -84,7 +84,10 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
* When using the EPT page-modification log, the GPAs in the log
* would come from L2 rather than L1. Therefore, we need to rely
* on write protection to record dirty pages. This also bypasses
* PML, since writes now result in a vmexit.
* PML, since writes now result in a vmexit. Note, this helper will
* tag SPTEs as needing write-protection even if PML is disabled or
* unsupported, but that's ok because the tag is consumed if and only
* if PML is enabled. Omit the PML check to save a few uops.
*/
return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
}
......@@ -138,6 +141,8 @@ enum {
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
#define SET_SPTE_SPURIOUS BIT(2)
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
......
......@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (sp->role.level > PG_LEVEL_4K)
return;
/*
* If addresses are being invalidated, skip prefetching to avoid
* accidentally prefetching those addresses.
*/
if (unlikely(vcpu->kvm->mmu_notifier_count))
return;
if (sp->role.direct)
return __direct_pte_prefetch(vcpu, sp, sptep);
......@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
hva_t hva;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
int max_level;
......@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
write_fault, &map_writable))
return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
......@@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
......
......@@ -1268,68 +1268,16 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
}
/*
* Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
* only used for PML, and so will involve setting the dirty bit on each SPTE.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
iter.old_spte & shadow_dirty_mask)
continue;
new_spte = iter.old_spte | shadow_dirty_mask;
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
rcu_read_unlock();
return spte_set;
}
/*
* Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
* only used for PML, and so will involve setting the dirty bit on each SPTE.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
int root_as_id;
bool spte_set = false;
for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
}
return spte_set;
}
/*
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
kvm_pfn_t pfn;
bool spte_set = false;
......@@ -1348,7 +1296,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
pfn = spte_to_pfn(iter.old_spte);
if (kvm_is_reserved_pfn(pfn) ||
!PageTransCompoundMap(pfn_to_page(pfn)))
iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
pfn, PG_LEVEL_NUM))
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
......@@ -1366,7 +1315,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
* be replaced by large mappings, for GFNs within the slot.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
int root_as_id;
......@@ -1376,8 +1325,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (root_as_id != slot->as_id)
continue;
zap_collapsible_spte_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
zap_collapsible_spte_range(kvm, root, slot);
}
}
......
......@@ -33,9 +33,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot);
struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
......
......@@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
nested_svm_vmexit(svm);
}
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
WARN_ON(!is_guest_mode(vcpu));
if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
!svm->nested.nested_run_pending) {
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = fault->error_code;
svm->vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm);
} else {
kvm_inject_page_fault(vcpu, fault);
}
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
struct vcpu_svm *svm = to_svm(vcpu);
......@@ -436,16 +453,33 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
{
int ret;
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
vmcb12->save.rip,
vmcb12->control.int_ctl,
vmcb12->control.event_inj,
vmcb12->control.nested_ctl);
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
vmcb12->control.intercepts[INTERCEPT_WORD3],
vmcb12->control.intercepts[INTERCEPT_WORD4],
vmcb12->control.intercepts[INTERCEPT_WORD5]);
svm->nested.vmcb12_gpa = vmcb12_gpa;
load_nested_vmcb_control(svm, &vmcb12->control);
nested_prepare_vmcb_save(svm, vmcb12);
nested_prepare_vmcb_control(svm);
nested_prepare_vmcb_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
if (ret)
return ret;
if (!npt_enabled)
svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
svm_set_gif(svm, true);
return 0;
......@@ -489,18 +523,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
goto out;
}
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
vmcb12->save.rip,
vmcb12->control.int_ctl,
vmcb12->control.event_inj,
vmcb12->control.nested_ctl);
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
vmcb12->control.intercepts[INTERCEPT_WORD3],
vmcb12->control.intercepts[INTERCEPT_WORD4],
vmcb12->control.intercepts[INTERCEPT_WORD5]);
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
......
......@@ -926,9 +926,6 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* Enable INVPCID feature */
kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
}
static __init int svm_hardware_setup(void)
......@@ -1103,12 +1100,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static void svm_check_invpcid(struct vcpu_svm *svm)
{
/*
* Intercept INVPCID instruction only if shadow page table is
* enabled. Interception is not required with nested page table
* enabled.
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
* roots, or if INVPCID is disabled in the guest to inject #UD.
*/
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
if (!npt_enabled)
if (!npt_enabled ||
!guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
svm_set_intercept(svm, INTERCEPT_INVPCID);
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
......@@ -2214,15 +2211,20 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
[SVM_INSTR_VMSAVE] = vmsave_interception,
};
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
if (is_guest_mode(vcpu)) {
svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
return nested_svm_vmexit(svm);
} else
return svm_instr_handlers[opcode](svm);
/* Returns '1' or -errno on failure, '0' on success. */
ret = nested_svm_vmexit(svm);
if (ret)
return ret;
return 1;
}
return svm_instr_handlers[opcode](svm);
}
/*
......
......@@ -2167,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
/*
* The PML address never changes, so it is constant in vmcs02.
* Conceptually we want to copy the PML index from vmcs01 here,
* and then back to vmcs01 on nested vmexit. But since we flush
* the log and reset GUEST_PML_INDEX on each vmexit, the PML
* index is also effectively constant in vmcs02.
* PML is emulated for L2, but never enabled in hardware as the MMU
* handles A/D emulation. Disabling PML for L2 also avoids having to
* deal with filtering out L2 GPAs from the buffer.
*/
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
vmcs_write64(PML_ADDRESS, 0);
vmcs_write16(GUEST_PML_INDEX, -1);
}
if (cpu_has_vmx_encls_vmexit())
......@@ -2210,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
u32 exec_control, vmcs12_exec_ctrl;
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
......@@ -2284,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
~SECONDARY_EXEC_ENABLE_PML;
exec_control |= vmcs12_exec_ctrl;
}
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
/* PML is emulated and never enabled in hardware for L2. */
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* VMCS shadowing for L2 is emulated for now */
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
......@@ -4200,9 +4198,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
......@@ -4495,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_set_virtual_apic_mode(vcpu);
}
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
vmx_update_cpu_dirty_logging(vcpu);
}
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_clean(vmx->nested.apic_access_page);
......@@ -5793,7 +5793,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_PREEMPTION_TIMER:
return true;
case EXIT_REASON_PML_FULL:
/* We emulate PML support to L1. */
/*
* PML is emulated for an L1 VMM and should never be enabled in
* vmcs02, always "handle" PML_FULL by exiting to userspace.
*/
return true;
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
......
......@@ -298,7 +298,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
if (IS_ERR(event)) {
pr_debug_ratelimited("%s: failed %ld\n",
__func__, PTR_ERR(event));
return -ENOENT;
return PTR_ERR(event);
}
lbr_desc->event = event;
pmu->event_count++;
......@@ -320,7 +320,7 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
return false;
if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
goto dummy;
/*
......
......@@ -4277,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
if (!enable_pml)
/*
* PML is enabled/disabled when dirty logging of memsmlots changes, but
* it needs to be set here when dirty logging is already active, e.g.
* if this vCPU was created after dirty logging was enabled.
*/
if (!vcpu->kvm->arch.cpu_dirty_logging_count)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
if (cpu_has_vmx_xsaves()) {
......@@ -4295,18 +4300,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
/*
* Expose INVPCID if and only if PCID is also exposed to the guest.
* INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
* if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
* behavior from the guest perspective (it would expect #GP or #PF).
*/
if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
......@@ -5776,24 +5771,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
/*
* Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
* Called before reporting dirty_bitmap to userspace.
*/
static void kvm_flush_pml_buffers(struct kvm *kvm)
{
int i;
struct kvm_vcpu *vcpu;
/*
* We only need to kick vcpu out of guest mode here, as PML buffer
* is flushed at beginning of all VMEXITs, and it's obvious that only
* vcpus running in guest are possible to have unflushed GPAs in PML
* buffer.
*/
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
static void vmx_dump_sel(char *name, uint32_t sel)
{
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
......@@ -5976,9 +5953,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
* querying dirty_bitmap, we only need to kick all vcpus out of guest
* mode as if vcpus is in root mode, the PML buffer must has been
* flushed already.
* flushed already. Note, PML is never enabled in hardware while
* running L2.
*/
if (enable_pml)
if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
/*
......@@ -5994,6 +5972,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return handle_invalid_guest_state(vcpu);
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
* PML full exit occurs as something is horribly wrong.
*/
if (exit_reason.basic == EXIT_REASON_PML_FULL)
goto unexpected_vmexit;
/*
* The host physical addresses of some pages of guest memory
* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
......@@ -6851,13 +6836,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
if (likely(!vmx->exit_reason.failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
......@@ -7330,8 +7317,8 @@ static __init void vmx_set_cpu_caps(void)
/* CPUID 0x7 */
if (kvm_mpx_supported())
kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
if (cpu_has_vmx_invpcid())
kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
if (!cpu_has_vmx_invpcid())
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
......@@ -7509,30 +7496,24 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
struct kvm_memory_slot *slot)
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
kvm_mmu_slot_set_dirty(kvm, slot);
}
struct vcpu_vmx *vmx = to_vmx(vcpu);
static void vmx_flush_log_dirty(struct kvm *kvm)
{
kvm_flush_pml_buffers(kvm);
}
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
return;
}
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
{
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
/*
* Note, cpu_dirty_logging_count can be changed concurrent with this
* code, but in that case another update request will be made and so
* the guest will never run with a stale PML value.
*/
if (vcpu->kvm->arch.cpu_dirty_logging_count)
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
static int vmx_pre_block(struct kvm_vcpu *vcpu)
......@@ -7642,11 +7623,6 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
static int vmx_cpu_dirty_log_size(void)
{
return enable_pml ? PML_ENTITY_NUM : 0;
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_unsetup = hardware_unsetup,
......@@ -7746,10 +7722,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.sched_in = vmx_sched_in,
.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
......@@ -7777,7 +7751,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.msr_filter_changed = vmx_msr_filter_changed,
.complete_emulated_msr = kvm_complete_insn_gp,
.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
......@@ -7894,13 +7867,8 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
if (!enable_pml) {
vmx_x86_ops.slot_enable_log_dirty = NULL;
vmx_x86_ops.slot_disable_log_dirty = NULL;
vmx_x86_ops.flush_log_dirty = NULL;
vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
vmx_x86_ops.cpu_dirty_log_size = NULL;
}
if (!enable_pml)
vmx_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
......
......@@ -165,6 +165,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
......@@ -393,6 +394,7 @@ int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type, bool value);
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
static inline u8 vmx_get_rvi(void)
{
......
......@@ -5215,10 +5215,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
* Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
* before reporting dirty_bitmap to userspace. KVM flushes the buffers
* on all VM-Exits, thus we only need to kick running vCPUs to force a
* VM-Exit.
*/
static_call_cond(kvm_x86_flush_log_dirty)(kvm);
struct kvm_vcpu *vcpu;
int i;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
......@@ -8980,6 +8988,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
static_call(kvm_x86_msr_filter_changed)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
......@@ -10748,75 +10759,96 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return 0;
}
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
struct kvm_arch *ka = &kvm->arch;
if (!kvm_x86_ops.cpu_dirty_log_size)
return;
if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
(!enable && --ka->cpu_dirty_logging_count == 0))
kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *old,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
/*
* Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
* See comments below.
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
return;
if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
* sptes have to be split. If live migration is successful, the guest
* in the source machine will be destroyed and large sptes will be
* created in the destination. However, if the guest continues to run
* in the source machine (for example if live migration fails), small
* sptes will remain around and cause bad performance.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
* Nothing more to do for RO slots (which can't be dirtied and can't be
* made writable) or CREATE/MOVE/DELETE of a slot.
*
* There is no need to do this in any of the following cases:
* For a memslot with dirty logging disabled:
* CREATE: No dirty mappings will already exist.
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot()
*
* For a memslot with dirty logging enabled:
* CREATE: No shadow pages exist, thus nothing to write-protect
* and no dirty bits to clear.
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot().
*/
if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_zap_collapsible_sptes(kvm, new);
if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
return;
/*
* Enable or disable dirty logging for the slot.
*
* For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
* slot have been zapped so no dirty logging updates are needed for
* the old slot.
* For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
* any mappings that might be created in it will consume the
* properties of the new slot and do not need to be updated here.
*
* When PML is enabled, the kvm_x86_ops dirty logging hooks are
* called to enable/disable dirty logging.
*
* When disabling dirty logging with PML enabled, the D-bit is set
* for sptes in the slot in order to prevent unnecessary GPA
* logging in the PML buffer (and potential PML buffer full VMEXIT).
* This guarantees leaving PML enabled for the guest's lifetime
* won't have any additional overhead from PML when the guest is
* running with dirty logging disabled.
*
* When enabling dirty logging, large sptes are write-protected
* so they can be split on first write. New large sptes cannot
* be created for this slot until the end of the logging.
* See the comments in fast_page_fault().
* For small sptes, nothing is done if the dirty log is in the
* initial-all-set state. Otherwise, depending on whether pml
* is enabled the D-bit or the W-bit will be cleared.
* READONLY and non-flags changes were filtered out above, and the only
* other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
* logging isn't being toggled on or off.
*/
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (kvm_x86_ops.slot_enable_log_dirty) {
static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
} else {
int level =
kvm_dirty_log_manual_protect_and_init_set(kvm) ?
PG_LEVEL_2M : PG_LEVEL_4K;
if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
return;
if (!log_dirty_pages) {
/*
* Dirty logging tracks sptes in 4k granularity, meaning that
* large sptes have to be split. If live migration succeeds,
* the guest in the source machine will be destroyed and large
* sptes will be created in the destination. However, if the
* guest continues to run in the source machine (for example if
* live migration fails), small sptes will remain around and
* cause bad performance.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
} else {
/* By default, write-protect everything to log writes. */
int level = PG_LEVEL_4K;
if (kvm_x86_ops.cpu_dirty_log_size) {
/*
* Clear all dirty bits, unless pages are treated as
* dirty from the get-go.
*/
if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
/*
* Write-protect large pages on write so that dirty
* logging happens at 4k granularity. No need to
* write-protect small SPTEs since write accesses are
* logged by the CPU via dirty bits.
*/
level = PG_LEVEL_2M;
} else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
/*
* If we're with initial-all-set, we don't need
* to write protect any small page because
......@@ -10825,10 +10857,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* so that the page split can happen lazily on
* the first write to the huge page.
*/
kvm_mmu_slot_remove_write_access(kvm, new, level);
level = PG_LEVEL_2M;
}
} else {
static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
......
......@@ -11,6 +11,7 @@
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/preempt.h>
......@@ -506,6 +507,8 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
unsigned long mmu_notifier_range_start;
unsigned long mmu_notifier_range_end;
#endif
long tlbs_dirty;
struct list_head devices;
......@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable);
bool *writable, hva_t *hva);
void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
......@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
return 1;
return 0;
}
static inline int mmu_notifier_retry_hva(struct kvm *kvm,
unsigned long mmu_seq,
unsigned long hva)
{
lockdep_assert_held(&kvm->mmu_lock);
/*
* If mmu_notifier_count is non-zero, then the range maintained by
* kvm_mmu_notifier_invalidate_range_start contains all addresses that
* might be being invalidated. Note that it may include some false
* positives, due to shortcuts when handing concurrent invalidations.
*/
if (unlikely(kvm->mmu_notifier_count) &&
hva >= kvm->mmu_notifier_range_start &&
hva < kvm->mmu_notifier_range_end)
return 1;
if (kvm->mmu_notifier_seq != mmu_seq)
return 1;
return 0;
}
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
......
......@@ -33,6 +33,7 @@
/demand_paging_test
/dirty_log_test
/dirty_log_perf_test
/hardware_disable_test
/kvm_create_max_vcpus
/memslot_modification_stress_test
/set_memory_region_test
......
......@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
TEST_GEN_PROGS_x86_64 += hardware_disable_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
TEST_GEN_PROGS_x86_64 += set_memory_region_test
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* This test is intended to reproduce a crash that happens when
* kvm_arch_hardware_disable is called and it attempts to unregister the user
* return notifiers.
*/
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <test_util.h>
#include "kvm_util.h"
#define VCPU_NUM 4
#define SLEEPING_THREAD_NUM (1 << 4)
#define FORK_NUM (1ULL << 9)
#define DELAY_US_MAX 2000
#define GUEST_CODE_PIO_PORT 4
sem_t *sem;
/* Arguments for the pthreads */
struct payload {
struct kvm_vm *vm;
uint32_t index;
};
static void guest_code(void)
{
for (;;)
; /* Some busy work */
printf("Should not be reached.\n");
}
static void *run_vcpu(void *arg)
{
struct payload *payload = (struct payload *)arg;
struct kvm_run *state = vcpu_state(payload->vm, payload->index);
vcpu_run(payload->vm, payload->index);
TEST_ASSERT(false, "%s: exited with reason %d: %s\n",
__func__, state->exit_reason,
exit_reason_str(state->exit_reason));
pthread_exit(NULL);
}
static void *sleeping_thread(void *arg)
{
int fd;
while (true) {
fd = open("/dev/null", O_RDWR);
close(fd);
}
TEST_ASSERT(false, "%s: exited\n", __func__);
pthread_exit(NULL);
}
static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr,
void *(*f)(void *), void *arg)
{
int r;
r = pthread_create(thread, attr, f, arg);
TEST_ASSERT(r == 0, "%s: failed to create thread", __func__);
}
static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set)
{
int r;
r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set);
TEST_ASSERT(r == 0, "%s: failed set affinity", __func__);
}
static inline void check_join(pthread_t thread, void **retval)
{
int r;
r = pthread_join(thread, retval);
TEST_ASSERT(r == 0, "%s: failed to join thread", __func__);
}
static void run_test(uint32_t run)
{
struct kvm_vm *vm;
cpu_set_t cpu_set;
pthread_t threads[VCPU_NUM];
pthread_t throw_away;
struct payload payloads[VCPU_NUM];
void *b;
uint32_t i, j;
CPU_ZERO(&cpu_set);
for (i = 0; i < VCPU_NUM; i++)
CPU_SET(i, &cpu_set);
vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
vm_create_irqchip(vm);
fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run);
for (i = 0; i < VCPU_NUM; ++i) {
vm_vcpu_add_default(vm, i, guest_code);
payloads[i].vm = vm;
payloads[i].index = i;
check_create_thread(&threads[i], NULL, run_vcpu,
(void *)&payloads[i]);
check_set_affinity(threads[i], &cpu_set);
for (j = 0; j < SLEEPING_THREAD_NUM; ++j) {
check_create_thread(&throw_away, NULL, sleeping_thread,
(void *)NULL);
check_set_affinity(throw_away, &cpu_set);
}
}
fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run);
sem_post(sem);
for (i = 0; i < VCPU_NUM; ++i)
check_join(threads[i], &b);
/* Should not be reached */
TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run);
}
int main(int argc, char **argv)
{
uint32_t i;
int s, r;
pid_t pid;
sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0);
sem_unlink("vm_sem");
for (i = 0; i < FORK_NUM; ++i) {
pid = fork();
TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__);
if (pid == 0)
run_test(i); /* This function always exits */
fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i);
sem_wait(sem);
r = (rand() % DELAY_US_MAX) + 1;
fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r);
usleep(r);
r = waitpid(pid, &s, WNOHANG);
TEST_ASSERT(r != pid,
"%s: [%d] child exited unexpectedly status: [%d]",
__func__, i, s);
fprintf(stderr, "%s: [%d] killing child\n", __func__, i);
kill(pid, SIGKILL);
}
sem_destroy(sem);
exit(0);
}
......@@ -720,7 +720,8 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
struct kvm_cpuid2 *cpuid;
int rc, max_ent;
int max_ent;
int rc = -1;
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
......
......@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
if (likely(kvm->mmu_notifier_count == 1)) {
kvm->mmu_notifier_range_start = range->start;
kvm->mmu_notifier_range_end = range->end;
} else {
/*
* Fully tracking multiple concurrent ranges has dimishing
* returns. Keep things simple and just find the minimal range
* which includes the current and new ranges. As there won't be
* enough information to subtract a range after its invalidate
* completes, any ranges invalidated concurrently will
* accumulate and persist until all outstanding invalidates
* complete.
*/
kvm->mmu_notifier_range_start =
min(kvm->mmu_notifier_range_start, range->start);
kvm->mmu_notifier_range_end =
max(kvm->mmu_notifier_range_end, range->end);
}
need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
range->flags);
/* we've to flush the tlb before the pages can be freed */
......@@ -2023,10 +2041,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable)
bool *writable, hva_t *hva)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
if (hva)
*hva = addr;
if (addr == KVM_HVA_ERR_RO_BAD) {
if (writable)
*writable = false;
......@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
write_fault, writable);
write_fault, writable, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册