Commit afbb1b1c authored by Marc Zyngier

Merge branch kvm-arm64/s1ptw-write-fault into kvmarm-master/fixes

* kvm-arm64/s1ptw-write-fault:
  : .
  : Fix S1PTW fault handling that was until then always taken
  : as a write. From the cover letter:
  :
  : "Recent developments on the EFI front have resulted in guests that
  : simply won't boot if the page tables are in a read-only memslot and
  : that you're a bit unlucky in the way S2 gets paged in... The core
  : issue is related to the fact that we treat a S1PTW as a write, which
  : is close enough to what needs to be done. Until you get to RO memslots.
  :
  : The first patch fixes this and is definitely a stable candidate. It
  : splits the faulting of page tables in two steps (RO translation fault,
  : followed by a writable permission fault -- should it even happen).
  : The second one documents the slightly odd behaviour of PTW writes to
  : RO memslot, which do not result in a KVM_MMIO exit. The last patch is
  : totally optional, only tangentially related, and randomly repainting
  : stuff (maybe that's contagious, who knows)."
  :
  : .
  KVM: arm64: Convert FSC_* over to ESR_ELx_FSC_*
  KVM: arm64: Document the behaviour of S1PTW faults on RO memslots
  KVM: arm64: Fix S1PTW handling on RO memslots
Signed-off-by: Marc Zyngier <maz@kernel.org>
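In short, the first patch makes KVM treat an S1PTW abort as a write only when it is a permission fault. Below is a minimal, self-contained sketch of that decision with hypothetical struct and helper names (the real change is the kvm_is_write_fault() hunk further down):

```c
#include <stdbool.h>

/* Hypothetical, simplified types: the kernel works on struct kvm_vcpu
 * and the ESR_ELx_FSC_* constants introduced in the hunks below. */
#define FSC_TYPE_TRANS 0x04     /* translation fault */
#define FSC_TYPE_PERM  0x0c     /* permission fault */

struct abort_info {
        bool s1ptw;             /* abort raised by the stage-1 page-table walker */
        unsigned int fsc_type;  /* fault type bits taken from ESR_ELx */
        bool wnr;               /* write-not-read bit of a normal data abort */
};

/*
 * Treat an S1PTW abort as a write only once the page is already mapped
 * and the walker wants to update the Access/Dirty flags, i.e. on a
 * permission fault. The initial translation fault is handled as a read,
 * so page tables sitting in a read-only memslot can still be mapped in.
 * The cost is a second fault when the guest uses hardware AF/DB updates.
 */
static bool abort_is_write(const struct abort_info *a)
{
        if (a->s1ptw)
                return a->fsc_type == FSC_TYPE_PERM;
        return a->wnr;
}
```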
@@ -1354,6 +1354,14 @@ the memory region are automatically reflected into the guest. For example, an
 mmap() that affects the region will be made visible immediately. Another
 example is madvise(MADV_DROP).
+Note: On arm64, a write generated by the page-table walker (to update
+the Access and Dirty flags, for example) never results in a
+KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This
+is because KVM cannot provide the data that would be written by the
+page-table walker, making it impossible to emulate the access.
+Instead, an abort (data abort if the cause of the page-table update
+was a load or a store, instruction abort if it was an instruction
+fetch) is injected in the guest.
 4.36 KVM_SET_TSS_ADDR
 ---------------------
......
@@ -114,6 +114,15 @@
 #define ESR_ELx_FSC_ACCESS (0x08)
 #define ESR_ELx_FSC_FAULT (0x04)
 #define ESR_ELx_FSC_PERM (0x0C)
+#define ESR_ELx_FSC_SEA_TTW0 (0x14)
+#define ESR_ELx_FSC_SEA_TTW1 (0x15)
+#define ESR_ELx_FSC_SEA_TTW2 (0x16)
+#define ESR_ELx_FSC_SEA_TTW3 (0x17)
+#define ESR_ELx_FSC_SECC (0x18)
+#define ESR_ELx_FSC_SECC_TTW0 (0x1c)
+#define ESR_ELx_FSC_SECC_TTW1 (0x1d)
+#define ESR_ELx_FSC_SECC_TTW2 (0x1e)
+#define ESR_ELx_FSC_SECC_TTW3 (0x1f)
 /* ISS field definitions for Data Aborts */
 #define ESR_ELx_ISV_SHIFT (24)
......
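For context on how these constants get consumed: the fault status code occupies the low bits of ESR_ELx, and KVM masks off the level bits before comparing against the ESR_ELx_FSC_* values. A small illustrative sketch (constants copied here so it stands alone; in the kernel they come from asm/esr.h):

```c
#include <stdbool.h>
#include <stdint.h>

/* Mirrored from arch/arm64/include/asm/esr.h for illustration only. */
#define ESR_ELx_FSC        0x3f   /* full fault status code field */
#define ESR_ELx_FSC_TYPE   0x3c   /* fault type, with the level bits masked off */
#define ESR_ELx_FSC_FAULT  0x04   /* translation fault */
#define ESR_ELx_FSC_ACCESS 0x08   /* access flag fault */
#define ESR_ELx_FSC_PERM   0x0c   /* permission fault */

/* A stage-2 abort KVM is prepared to handle is a translation, access
 * or permission fault; external aborts (the SEA/SECC codes added above)
 * and anything else are rejected. */
static bool fsc_is_handled_fault(uint64_t esr)
{
        switch (esr & ESR_ELx_FSC_TYPE) {
        case ESR_ELx_FSC_FAULT:
        case ESR_ELx_FSC_ACCESS:
        case ESR_ELx_FSC_PERM:
                return true;
        default:
                return false;
        }
}
```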
@@ -319,21 +319,6 @@
                          BIT(18) |              \
                          GENMASK(16, 15))
-/* For compatibility with fault code shared with 32-bit */
-#define FSC_FAULT ESR_ELx_FSC_FAULT
-#define FSC_ACCESS ESR_ELx_FSC_ACCESS
-#define FSC_PERM ESR_ELx_FSC_PERM
-#define FSC_SEA ESR_ELx_FSC_EXTABT
-#define FSC_SEA_TTW0 (0x14)
-#define FSC_SEA_TTW1 (0x15)
-#define FSC_SEA_TTW2 (0x16)
-#define FSC_SEA_TTW3 (0x17)
-#define FSC_SECC (0x18)
-#define FSC_SECC_TTW0 (0x1c)
-#define FSC_SECC_TTW1 (0x1d)
-#define FSC_SECC_TTW2 (0x1e)
-#define FSC_SECC_TTW3 (0x1f)
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK (~UL(0xf))
 /*
......
@@ -349,16 +349,16 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *v
 static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
 {
        switch (kvm_vcpu_trap_get_fault(vcpu)) {
-       case FSC_SEA:
-       case FSC_SEA_TTW0:
-       case FSC_SEA_TTW1:
-       case FSC_SEA_TTW2:
-       case FSC_SEA_TTW3:
-       case FSC_SECC:
-       case FSC_SECC_TTW0:
-       case FSC_SECC_TTW1:
-       case FSC_SECC_TTW2:
-       case FSC_SECC_TTW3:
+       case ESR_ELx_FSC_EXTABT:
+       case ESR_ELx_FSC_SEA_TTW0:
+       case ESR_ELx_FSC_SEA_TTW1:
+       case ESR_ELx_FSC_SEA_TTW2:
+       case ESR_ELx_FSC_SEA_TTW3:
+       case ESR_ELx_FSC_SECC:
+       case ESR_ELx_FSC_SECC_TTW0:
+       case ESR_ELx_FSC_SECC_TTW1:
+       case ESR_ELx_FSC_SECC_TTW2:
+       case ESR_ELx_FSC_SECC_TTW3:
                return true;
        default:
                return false;
@@ -373,8 +373,26 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
 static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 {
-       if (kvm_vcpu_abt_iss1tw(vcpu))
-               return true;
+       if (kvm_vcpu_abt_iss1tw(vcpu)) {
+               /*
+                * Only a permission fault on a S1PTW should be
+                * considered as a write. Otherwise, page tables baked
+                * in a read-only memslot will result in an exception
+                * being delivered in the guest.
+                *
+                * The drawback is that we end-up faulting twice if the
+                * guest is using any of HW AF/DB: a translation fault
+                * to map the page containing the PT (read only at
+                * first), then a permission fault to allow the flags
+                * to be set.
+                */
+               switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+               case ESR_ELx_FSC_PERM:
+                       return true;
+               default:
+                       return false;
+               }
+       }
        if (kvm_vcpu_trap_is_iabt(vcpu))
                return false;
......
@@ -60,7 +60,7 @@ static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
         */
        if (!(esr & ESR_ELx_S1PTW) &&
            (cpus_have_final_cap(ARM64_WORKAROUND_834220) ||
-            (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) {
+            (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) {
                if (!__translate_far_to_hpfar(far, &hpfar))
                        return false;
        } else {
......
@@ -367,7 +367,7 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
        if (static_branch_unlikely(&vgic_v2_cpuif_trap)) {
                bool valid;
-               valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
+               valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT &&
                        kvm_vcpu_dabt_isvalid(vcpu) &&
                        !kvm_vcpu_abt_issea(vcpu) &&
                        !kvm_vcpu_abt_iss1tw(vcpu);
......
@@ -1212,7 +1212,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
        VM_BUG_ON(write_fault && exec_fault);
-       if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
+       if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) {
                kvm_err("Unexpected L2 read permission error\n");
                return -EFAULT;
        }
@@ -1277,7 +1277,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * only exception to this is when dirty logging is enabled at runtime
         * and a write fault needs to collapse a block entry into a table.
         */
-       if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+       if (fault_status != ESR_ELx_FSC_PERM ||
+           (logging_active && write_fault)) {
                ret = kvm_mmu_topup_memory_cache(memcache,
                                                 kvm_mmu_cache_min_pages(kvm));
                if (ret)
@@ -1342,7 +1343,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * backed by a THP and thus use block mapping if possible.
         */
        if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
-               if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
+               if (fault_status == ESR_ELx_FSC_PERM &&
+                   fault_granule > PAGE_SIZE)
                        vma_pagesize = fault_granule;
                else
                        vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
@@ -1350,7 +1352,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                                                                   &fault_ipa);
        }
-       if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
+       if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
                /* Check the VMM hasn't introduced a new disallowed VMA */
                if (kvm_vma_mte_allowed(vma)) {
                        sanitise_mte_tags(kvm, pfn, vma_pagesize);
@@ -1376,7 +1378,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * permissions only if vma_pagesize equals fault_granule. Otherwise,
         * kvm_pgtable_stage2_map() should be called to change block size.
         */
-       if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
+       if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
                ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
        else
                ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
@@ -1441,7 +1443,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
-       if (fault_status == FSC_FAULT) {
+       if (fault_status == ESR_ELx_FSC_FAULT) {
                /* Beyond sanitised PARange (which is the IPA limit) */
                if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
                        kvm_inject_size_fault(vcpu);
@@ -1476,8 +1478,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
                                      kvm_vcpu_get_hfar(vcpu), fault_ipa);
        /* Check the stage-2 fault is trans. fault or write fault */
-       if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
-           fault_status != FSC_ACCESS) {
+       if (fault_status != ESR_ELx_FSC_FAULT &&
+           fault_status != ESR_ELx_FSC_PERM &&
+           fault_status != ESR_ELx_FSC_ACCESS) {
                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
                        kvm_vcpu_trap_get_class(vcpu),
                        (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1539,7 +1542,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
-       if (fault_status == FSC_ACCESS) {
+       if (fault_status == ESR_ELx_FSC_ACCESS) {
                handle_access_fault(vcpu, fault_ipa);
                ret = 1;
                goto out_unlock;
......
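The api.rst note above concerns slots created with KVM_MEM_READONLY. For reference, this is roughly how a VMM registers such a slot through the standard KVM ioctl interface (a sketch: the slot number and error handling are up to the caller, and the flag is only usable when KVM_CAP_READONLY_MEM is supported):

```c
#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Register host memory as a read-only guest memslot (e.g. firmware or
 * EFI page tables). Ordinary guest writes to the range exit to
 * userspace with KVM_EXIT_MMIO; writes generated by the stage-1
 * page-table walker instead inject an abort into the guest, as the
 * documentation hunk above spells out.
 */
static int add_readonly_slot(int vm_fd, __u32 slot, __u64 gpa,
                             __u64 size, void *host_mem)
{
        struct kvm_userspace_memory_region region = {
                .slot = slot,
                .flags = KVM_MEM_READONLY,
                .guest_phys_addr = gpa,
                .memory_size = size,
                .userspace_addr = (__u64)(unsigned long)host_mem,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
```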