Commit 3bba8056 authored by Anup Patel, committed by Xie XiuQi

RISC-V: KVM: Implement MMU notifiers

euleros inclusion
category: feature
feature: initial KVM RISC-V support
bugzilla: 46845
CVE: NA

This patch implements MMU notifiers for KVM RISC-V so that the Guest
physical address space stays in sync with the Host physical address space.

This will allow swapping, page migration, etc to work transparently
with KVM RISC-V.
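For context, here is a minimal sketch (not part of this patch, and assuming the
5.10-era generic notifier in virt/kvm/kvm_main.c) of how a host-side invalidation
reaches the arch hooks added below; the helper name sketch_invalidate_range_start()
is hypothetical and the real code additionally handles SRCU and remote TLB flushes:

  /*
   * Hypothetical sketch: once KVM_ARCH_WANT_MMU_NOTIFIER is defined, the
   * generic notifier takes kvm->mmu_lock, bumps the notifier count and
   * calls the arch hook so the stage2 mappings for the range are dropped.
   */
  static void sketch_invalidate_range_start(struct kvm *kvm,
					    unsigned long start,
					    unsigned long end,
					    unsigned int flags)
  {
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_count++;	/* makes in-flight stage2 faults retry */
	kvm_unmap_hva_range(kvm, start, end, flags);
	spin_unlock(&kvm->mmu_lock);
  }

Correspondingly, kvm_riscv_stage2_map() below samples kvm->mmu_notifier_seq before
pinning the page and bails out via mmu_notifier_retry() if such an invalidation
raced with the fault.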

Reference: https://gitee.com/openeuler/kernel/issues/I26X9V

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alexander Graf <graf@amazon.com>
Reviewed-by: Yifei Jiang <jiangyifei@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Parent 4c99be36
@@ -203,6 +203,13 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start,
+			unsigned long end, unsigned int flags);
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+
 void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa, unsigned long vmid);
 void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid);
 void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa);
......
@@ -20,6 +20,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
 	depends on RISCV_SBI && MMU
+	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select KVM_MMIO
......
@@ -296,7 +296,8 @@ static void stage2_op_pte(struct kvm *kvm, gpa_t addr,
 	}
 }
 
-static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
+static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
+			       gpa_t size, bool may_block)
 {
 	int ret;
 	pte_t *ptep;
@@ -321,6 +322,13 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
 next:
 		addr += page_size;
+
+		/*
+		 * If the range is too large, release the kvm->mmu_lock
+		 * to prevent starvation and lockup detector warnings.
+		 */
+		if (may_block && addr < end)
+			cond_resched_lock(&kvm->mmu_lock);
 	}
 }
@@ -404,6 +412,38 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 }
 
+static int handle_hva_to_gpa(struct kvm *kvm,
+			     unsigned long start,
+			     unsigned long end,
+			     int (*handler)(struct kvm *kvm,
+					    gpa_t gpa, u64 size,
+					    void *data),
+			     void *data)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(kvm);
+
+	/* we only care about the pages that the guest sees */
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gpa;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+
+		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
+		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
+	}
+
+	return ret;
+}
+
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					     struct kvm_memory_slot *slot,
 					     gfn_t gfn_offset,
@@ -549,7 +589,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		stage2_unmap_range(kvm, mem->guest_phys_addr,
-				   mem->memory_size);
+				   mem->memory_size, false);
 	spin_unlock(&kvm->mmu_lock);
 
 out:
@@ -557,6 +597,96 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return ret;
 }
 
+static int kvm_unmap_hva_handler(struct kvm *kvm,
+				 gpa_t gpa, u64 size, void *data)
+{
+	unsigned int flags = *(unsigned int *)data;
+	bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
+
+	stage2_unmap_range(kvm, gpa, size, may_block);
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start,
+			unsigned long end, unsigned int flags)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+
+	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
+	return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm,
+				gpa_t gpa, u64 size, void *data)
+{
+	pte_t *pte = (pte_t *)data;
+
+	WARN_ON(size != PAGE_SIZE);
+	stage2_set_pte(kvm, 0, NULL, gpa, pte);
+
+	return 0;
+}
+
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	unsigned long end = hva + PAGE_SIZE;
+	kvm_pfn_t pfn = pte_pfn(pte);
+	pte_t stage2_pte;
+
+	if (!kvm->arch.pgd)
+		return 0;
+
+	stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC);
+	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+	return 0;
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm,
+			       gpa_t gpa, u64 size, void *data)
+{
+	pte_t *ptep;
+	u32 ptep_level = 0;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+
+	if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level))
+		return 0;
+
+	return ptep_test_and_clear_young(NULL, 0, ptep);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+
+	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm,
+				    gpa_t gpa, u64 size, void *data)
+{
+	pte_t *ptep;
+	u32 ptep_level = 0;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
+
+	if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level))
+		return 0;
+
+	return pte_young(*ptep);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+
+	return handle_hva_to_gpa(kvm, hva, hva,
+				 kvm_test_age_hva_handler, NULL);
+}
+
 int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 			 struct kvm_memory_slot *memslot,
 			 gpa_t gpa, unsigned long hva,
@@ -571,7 +701,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
 	bool logging = (memslot->dirty_bitmap &&
 		!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
-	unsigned long vma_pagesize;
+	unsigned long vma_pagesize, mmu_seq;
 
 	mmap_read_lock(current->mm);
@@ -610,6 +740,8 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 		return ret;
 	}
 
+	mmu_seq = kvm->mmu_notifier_seq;
+
 	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, NULL);
 	if (hfn == KVM_PFN_ERR_HWPOISON) {
 		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
@@ -628,6 +760,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 
 	spin_lock(&kvm->mmu_lock);
 
+	if (mmu_notifier_retry(kvm, mmu_seq))
+		goto out_unlock;
+
 	if (writeable) {
 		kvm_set_pfn_dirty(hfn);
 		mark_page_dirty(kvm, gfn);
@@ -641,6 +776,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 	if (ret)
 		kvm_err("Failed to map in stage2\n");
 
+out_unlock:
 	spin_unlock(&kvm->mmu_lock);
 	kvm_set_pfn_accessed(hfn);
 	kvm_release_pfn_clean(hfn);
@@ -677,7 +813,7 @@ void kvm_riscv_stage2_free_pgd(struct kvm *kvm)
 	spin_lock(&kvm->mmu_lock);
 	if (kvm->arch.pgd) {
-		stage2_unmap_range(kvm, 0UL, stage2_gpa_size);
+		stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false);
 		pgd = READ_ONCE(kvm->arch.pgd);
 		kvm->arch.pgd = NULL;
 		kvm->arch.pgd_phys = 0;
......
@@ -49,6 +49,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_READONLY_MEM:
......