提交 4cf302bc 编写于 作者: P Paul Mackerras 提交者: Avi Kivity

KVM: PPC: Allow for read-only pages backing a Book3S HV guest

With this, if a guest does an H_ENTER with a read/write HPTE on a page
which is currently read-only, we make the actual HPTE inserted be a
read-only version of the HPTE.  We now intercept protection faults as
well as HPTE not found faults, and for a protection fault we work out
whether it should be reflected to the guest (e.g. because the guest HPTE
didn't allow write access to usermode) or handled by switching to
kernel context and calling kvmppc_book3s_hv_page_fault, which will then
request write access to the page and update the actual HPTE.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
上级 a355aa54
...@@ -121,6 +121,22 @@ static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) ...@@ -121,6 +121,22 @@ static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
} }
/*
 * Test whether the protection field of an HPTE second doubleword allows
 * stores.
 *
 * Only the HPTE_R_PP0 and HPTE_R_PP bits of @ptel are examined.  The
 * entry is reported as not writable when that field equals PP_RXRX or
 * PP_RXXX; every other value is reported as writable.
 *
 * Returns 1 if writable, 0 otherwise.
 */
static inline int hpte_is_writable(unsigned long ptel)
{
	unsigned long prot = ptel & (HPTE_R_PP0 | HPTE_R_PP);

	if (prot == PP_RXRX || prot == PP_RXXX)
		return 0;
	return 1;
}
/*
 * Return a copy of @ptel whose protection field has been switched to a
 * read-only value; all bits outside the protection field are preserved.
 *
 * Two cases are distinguished:
 *  - HPTE_R_PP0 is set, or the HPTE_R_PP field equals PP_RWXX: the
 *    HPTE_R_PP field is cleared and PP_RXXX is ORed in.
 *  - anything else: the PP_RXRX bits are ORed into the existing value.
 */
static inline unsigned long hpte_make_readonly(unsigned long ptel)
{
	int use_rxxx = (ptel & HPTE_R_PP0) != 0 ||
		       (ptel & HPTE_R_PP) == PP_RWXX;

	if (!use_rxxx)
		return ptel | PP_RXRX;

	return (ptel & ~HPTE_R_PP) | PP_RXXX;
}
static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
{ {
unsigned int wimg = ptel & HPTE_R_WIMG; unsigned int wimg = ptel & HPTE_R_WIMG;
...@@ -140,7 +156,7 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) ...@@ -140,7 +156,7 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
* Lock and read a linux PTE. If it's present and writable, atomically * Lock and read a linux PTE. If it's present and writable, atomically
* set dirty and referenced bits and return the PTE, otherwise return 0. * set dirty and referenced bits and return the PTE, otherwise return 0.
*/ */
static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
{ {
pte_t pte, tmp; pte_t pte, tmp;
...@@ -158,7 +174,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) ...@@ -158,7 +174,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *p)
if (pte_present(pte)) { if (pte_present(pte)) {
pte = pte_mkyoung(pte); pte = pte_mkyoung(pte);
if (pte_write(pte)) if (writing && pte_write(pte))
pte = pte_mkdirty(pte); pte = pte_mkdirty(pte);
} }
......
...@@ -503,6 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -503,6 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct page *page, *pages[1]; struct page *page, *pages[1];
long index, ret, npages; long index, ret, npages;
unsigned long is_io; unsigned long is_io;
unsigned int writing, write_ok;
struct vm_area_struct *vma; struct vm_area_struct *vma;
/* /*
...@@ -553,8 +554,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -553,8 +554,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
pfn = 0; pfn = 0;
page = NULL; page = NULL;
pte_size = PAGE_SIZE; pte_size = PAGE_SIZE;
writing = (dsisr & DSISR_ISSTORE) != 0;
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn); hva = gfn_to_hva_memslot(memslot, gfn);
npages = get_user_pages_fast(hva, 1, 1, pages); npages = get_user_pages_fast(hva, 1, writing, pages);
if (npages < 1) { if (npages < 1) {
/* Check if it's an I/O mapping */ /* Check if it's an I/O mapping */
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
...@@ -565,6 +569,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -565,6 +569,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
((hva - vma->vm_start) >> PAGE_SHIFT); ((hva - vma->vm_start) >> PAGE_SHIFT);
pte_size = psize; pte_size = psize;
is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
write_ok = vma->vm_flags & VM_WRITE;
} }
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
if (!pfn) if (!pfn)
...@@ -575,6 +580,24 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -575,6 +580,24 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
page = compound_head(page); page = compound_head(page);
pte_size <<= compound_order(page); pte_size <<= compound_order(page);
} }
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
pte_t *ptep, pte;
/*
* We need to protect against page table destruction
* while looking up and updating the pte.
*/
rcu_read_lock_sched();
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
hva, NULL);
if (ptep && pte_present(*ptep)) {
pte = kvmppc_read_update_linux_pte(ptep, 1);
if (pte_write(pte))
write_ok = 1;
}
rcu_read_unlock_sched();
}
pfn = page_to_pfn(page); pfn = page_to_pfn(page);
} }
...@@ -595,6 +618,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -595,6 +618,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Set the HPTE to point to pfn */ /* Set the HPTE to point to pfn */
r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
if (hpte_is_writable(r) && !write_ok)
r = hpte_make_readonly(r);
ret = RESUME_GUEST; ret = RESUME_GUEST;
preempt_disable(); preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
...@@ -614,14 +639,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -614,14 +639,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unlock_rmap(rmap); unlock_rmap(rmap);
goto out_unlock; goto out_unlock;
} }
if (hptep[0] & HPTE_V_VALID) {
/* HPTE was previously valid, so we need to invalidate it */
unlock_rmap(rmap);
hptep[0] |= HPTE_V_ABSENT;
kvmppc_invalidate_hpte(kvm, hptep, index);
} else {
kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
}
hptep[1] = r; hptep[1] = r;
eieio(); eieio();
hptep[0] = hpte[0]; hptep[0] = hpte[0];
asm volatile("ptesync" : : : "memory"); asm volatile("ptesync" : : : "memory");
preempt_enable(); preempt_enable();
if (page) if (page && hpte_is_writable(r))
SetPageDirty(page); SetPageDirty(page);
out_put: out_put:
......
...@@ -120,7 +120,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, ...@@ -120,7 +120,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
} }
static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
unsigned long *pte_sizep) int writing, unsigned long *pte_sizep)
{ {
pte_t *ptep; pte_t *ptep;
unsigned long ps = *pte_sizep; unsigned long ps = *pte_sizep;
...@@ -137,7 +137,7 @@ static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, ...@@ -137,7 +137,7 @@ static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
return __pte(0); return __pte(0);
if (!pte_present(*ptep)) if (!pte_present(*ptep))
return __pte(0); return __pte(0);
return kvmppc_read_update_linux_pte(ptep); return kvmppc_read_update_linux_pte(ptep, writing);
} }
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
...@@ -154,12 +154,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ...@@ -154,12 +154,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long is_io; unsigned long is_io;
unsigned long *rmap; unsigned long *rmap;
pte_t pte; pte_t pte;
unsigned int writing;
unsigned long mmu_seq; unsigned long mmu_seq;
bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
psize = hpte_page_size(pteh, ptel); psize = hpte_page_size(pteh, ptel);
if (!psize) if (!psize)
return H_PARAMETER; return H_PARAMETER;
writing = hpte_is_writable(ptel);
pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
/* used later to detect if we might have been invalidated */ /* used later to detect if we might have been invalidated */
...@@ -208,8 +210,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ...@@ -208,8 +210,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
/* Look up the Linux PTE for the backing page */ /* Look up the Linux PTE for the backing page */
pte_size = psize; pte_size = psize;
pte = lookup_linux_pte(vcpu, hva, &pte_size); pte = lookup_linux_pte(vcpu, hva, writing, &pte_size);
if (pte_present(pte)) { if (pte_present(pte)) {
if (writing && !pte_write(pte))
/* make the actual HPTE be read-only */
ptel = hpte_make_readonly(ptel);
is_io = hpte_cache_bits(pte_val(pte)); is_io = hpte_cache_bits(pte_val(pte));
pa = pte_pfn(pte) << PAGE_SHIFT; pa = pte_pfn(pte) << PAGE_SHIFT;
} }
...@@ -678,7 +683,9 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); ...@@ -678,7 +683,9 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
/* /*
* Called in real mode to check whether an HPTE not found fault * Called in real mode to check whether an HPTE not found fault
* is due to accessing a paged-out page or an emulated MMIO page. * is due to accessing a paged-out page or an emulated MMIO page,
* or if a protection fault is due to accessing a page that the
* guest wanted read/write access to but which we made read-only.
* Returns a possibly modified status (DSISR) value if not * Returns a possibly modified status (DSISR) value if not
* (i.e. pass the interrupt to the guest), * (i.e. pass the interrupt to the guest),
* -1 to pass the fault up to host kernel mode code, -2 to do that * -1 to pass the fault up to host kernel mode code, -2 to do that
...@@ -696,12 +703,17 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, ...@@ -696,12 +703,17 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
struct revmap_entry *rev; struct revmap_entry *rev;
unsigned long pp, key; unsigned long pp, key;
valid = HPTE_V_VALID | HPTE_V_ABSENT; /* For protection fault, expect to find a valid HPTE */
valid = HPTE_V_VALID;
if (status & DSISR_NOHPTE)
valid |= HPTE_V_ABSENT;
index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
if (index < 0) if (index < 0) {
if (status & DSISR_NOHPTE)
return status; /* there really was no HPTE */ return status; /* there really was no HPTE */
return 0; /* for prot fault, HPTE disappeared */
}
hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
v = hpte[0] & ~HPTE_V_HVLOCK; v = hpte[0] & ~HPTE_V_HVLOCK;
r = hpte[1]; r = hpte[1];
...@@ -712,8 +724,8 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, ...@@ -712,8 +724,8 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
asm volatile("lwsync" : : : "memory"); asm volatile("lwsync" : : : "memory");
hpte[0] = v; hpte[0] = v;
/* If the HPTE is valid by now, retry the instruction */ /* For not found, if the HPTE is valid by now, retry the instruction */
if (v & HPTE_V_VALID) if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
return 0; return 0;
/* Check access permissions to the page */ /* Check access permissions to the page */
......
...@@ -1114,8 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ...@@ -1114,8 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
kvmppc_hdsi: kvmppc_hdsi:
mfspr r4, SPRN_HDAR mfspr r4, SPRN_HDAR
mfspr r6, SPRN_HDSISR mfspr r6, SPRN_HDSISR
/* HPTE not found fault? */ /* HPTE not found fault or protection fault? */
andis. r0, r6, DSISR_NOHPTE@h andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
beq 1f /* if not, send it to the guest */ beq 1f /* if not, send it to the guest */
andi. r0, r11, MSR_DR /* data relocation enabled? */ andi. r0, r11, MSR_DR /* data relocation enabled? */
beq 3f beq 3f
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册